Coverage for nltk.corpus.reader.nombank : 32%
![](keybd_closed.png)
Hot-keys on this page
r m x p toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
# Natural Language Toolkit: NomBank Corpus Reader # # Copyright (C) 2001-2012 NLTK Project # Authors: Paul Bedaride <paul.bedaride@gmail.com> # Edward Loper <edloper@gradient.cis.upenn.edu> # URL: <http://www.nltk.org/> # For license information, see LICENSE.TXT
""" Corpus reader for the nombank corpus, which augments the Penn Treebank with information about the predicate argument structure of every noun instance. The corpus consists of two parts: the predicate-argument annotations themselves, and a set of "frameset files" which define the argument labels used by the annotations, on a per-noun basis. Each "frameset file" contains one or more predicates, such as ``'turn'`` or ``'turn_on'``, each of which is divided into coarse-grained word senses called "rolesets". For each "roleset", the frameset file provides descriptions of the argument roles, along with examples. """ nounsfile=None, parse_fileid_xform=None, parse_corpus=None, encoding=None): """ :param root: The root directory for this corpus. :param nomfile: The name of the file containing the predicate- argument annotations (relative to ``root``). :param framefiles: A list or regexp specifying the frameset fileids for this corpus. :param parse_fileid_xform: A transform that should be applied to the fileids in this corpus. This should be a function of one argument (a fileid) that returns a string (the new fileid). :param parse_corpus: The corpus containing the parse trees corresponding to this corpus. These parse trees are necessary to resolve the tree pointers used by nombank. """ # If framefiles is specified as a regexp, expand it. # Initialze the corpus reader. encoding)
# Record our frame fileids & nom file.
""" :return: the text contents of the given fileids, as a single string. """ if fileids is None: fileids = self._fileids elif isinstance(fileids, compat.string_types): fileids = [fileids] return concat([self.open(f).read() for f in fileids])
""" :return: a corpus view that acts as a list of ``NombankInstance`` objects, one for each noun in the corpus. """ return StreamBackedCorpusView(self.abspath(self._nomfile), self._read_instance_block, encoding=self.encoding(self._nomfile))
""" :return: a corpus view that acts as a list of strings, one for each line in the predicate-argument annotation file. """ return StreamBackedCorpusView(self.abspath(self._nomfile), read_line_block, encoding=self.encoding(self._nomfile))
""" :return: the xml description for the given roleset. """ lemma = roleset_id.split('.')[0] framefile = 'frames/%s.xml' % lemma if framefile not in self._framefiles: raise ValueError('Frameset file for %s not found' % roleset_id)
# n.b.: The encoding for XML fileids is specified by the file # itself; so we ignore self._encoding here. etree = ElementTree.parse(self.abspath(framefile).open()).getroot() for roleset in etree.findall('predicate/roleset'): if roleset.attrib['id'] == roleset_id: return roleset else: raise ValueError('Roleset %s not found in %s' % (roleset_id, framefile))
""" :return: a corpus view that acts as a list of all noun lemmas in this corpus (from the nombank.1.0.words file). """ return StreamBackedCorpusView(self.abspath(self._nounsfile), read_line_block, encoding=self.encoding(self._nounsfile))
block = []
# Read 100 at a time. for i in range(100): line = stream.readline().strip() if line: block.append(NombankInstance.parse( line, self._parse_fileid_xform, self._parse_corpus))
return block
###################################################################### #{ Nombank Instance & related datatypes ######################################################################
predicate, predid, arguments, parse_corpus=None):
self.fileid = fileid """The name of the file containing the parse tree for this instance's sentence."""
self.sentnum = sentnum """The sentence number of this sentence within ``fileid``. Indexing starts from zero."""
self.wordnum = wordnum """The word number of this instance's predicate within its containing sentence. Word numbers are indexed starting from zero, and include traces and other empty parse elements."""
self.baseform = baseform """The baseform of the predicate."""
self.sensenumber = sensenumber """The sense number of the predicate"""
self.predicate = predicate """A ``NombankTreePointer`` indicating the position of this instance's predicate within its containing sentence."""
self.predid = predid """Identifier of the predicate """
self.arguments = tuple(arguments) """A list of tuples (argloc, argid), specifying the location and identifier for each of the predicate's arguments in the containing sentence. Argument identifiers are strings such as ``'ARG0'`` or ``'ARGM-TMP'``. This list does *not* contain the predicate."""
self.parse_corpus = parse_corpus """A corpus reader for the parse trees corresponding to the instances in this nombank corpus."""
def roleset(self):
    """Return this instance's roleset name, ``'<baseform>.<sensenumber>'``.

    The name is built from ``self.baseform`` and ``self.sensenumber``
    joined by a dot.  Use
    ``nombank.roleset() <NombankCorpusReader.roleset>`` to look up
    information about the roleset.
    """
    name_parts = (self.baseform, self.sensenumber)
    return '%s.%s' % name_parts
return ('<NombankInstance: %s, sent %s, word %s>' % (self.fileid, self.sentnum, self.wordnum))
s = '%s %s %s %s %s' % (self.fileid, self.sentnum, self.wordnum, self.basename, self.sensenumber) items = self.arguments + ((self.predicate, 'rel'),) for (argloc, argid) in sorted(items): s += ' %s-%s' % (argloc, argid) return s
if self.parse_corpus is None: return None if self.fileid not in self.parse_corpus.fileids(): return None return self.parse_corpus.parsed_sents(self.fileid)[self.sentnum] The parse tree corresponding to this instance, or None if the corresponding tree is not available.""")
pieces = s.split() if len(pieces) < 6: raise ValueError('Badly formatted nombank line: %r' % s)
# Divide the line into its basic pieces. (fileid, sentnum, wordnum, baseform, sensenumber) = pieces[:5]
args = pieces[5:] rel = [args.pop(i) for i,p in enumerate(args) if '-rel' in p] if len(rel) != 1: raise ValueError('Badly formatted nombank line: %r' % s)
# Apply the fileid selector, if any. if parse_fileid_xform is not None: fileid = parse_fileid_xform(fileid)
# Convert sentence & word numbers to ints. sentnum = int(sentnum) wordnum = int(wordnum)
# Parse the predicate location.
predloc, predid = rel[0].split('-', 1) predicate = NombankTreePointer.parse(predloc)
# Parse the arguments. arguments = [] for arg in args: argloc, argid = arg.split('-', 1) arguments.append( (NombankTreePointer.parse(argloc), argid) )
# Put it all together. return NombankInstance(fileid, sentnum, wordnum, baseform, sensenumber, predicate, predid, arguments, parse_corpus)
""" A pointer used by nombank to identify one or more constituents in a parse tree. ``NombankPointer`` is an abstract base class with three concrete subclasses:
- ``NombankTreePointer`` is used to point to single constituents. - ``NombankSplitTreePointer`` is used to point to 'split' constituents, which consist of a sequence of two or more ``NombankTreePointer`` pointers. - ``NombankChainTreePointer`` is used to point to entire trace chains in a tree. It consists of a sequence of pieces, which can be ``NombankTreePointer`` or ``NombankSplitTreePointer`` pointers. """ if self.__class__ == NombankPoitner: raise NotImplementedError()
self.pieces = pieces """A list of the pieces that make up this chain. Elements may be either ``NombankSplitTreePointer`` or ``NombankTreePointer`` pointers."""
return '*'.join('%s' % p for p in self.pieces) return '<NombankChainTreePointer: %s>' % self if tree is None: raise ValueError('Parse tree not avaialable') return Tree('*CHAIN*', [p.select(tree) for p in self.pieces])
self.pieces = pieces """A list of the pieces that make up this split constituent. Elements are all ``NombankTreePointer`` pointers."""
return ','.join('%s' % p for p in self.pieces) return '<NombankSplitTreePointer: %s>' % self if tree is None: raise ValueError('Parse tree not avaialable') return Tree('*SPLIT*', [p.select(tree) for p in self.pieces])
""" wordnum:height*wordnum:height*... wordnum:height,
""" self.wordnum = wordnum self.height = height
def parse(s):
    """Build a nombank pointer object from its string form ``s``.

    Three notations are recognised, checked in this order:

    - ``xx*yy*zz``: a chain; returns a ``NombankChainTreePointer``
      whose pieces are each parsed recursively.
    - ``xx,yy,zz``: a split argument; returns a
      ``NombankSplitTreePointer`` whose pieces are each parsed
      recursively.
    - ``wordnum:height``: a normal pointer; returns a
      ``NombankTreePointer`` built from the two integer fields.

    :raise ValueError: if ``s`` does not consist of exactly two
        ``:``-separated fields once chains and splits are ruled out
        (``int()`` also raises ``ValueError`` for non-numeric fields).
    """
    # Chains are checked first, so each chain piece may itself be a
    # split pointer (the recursive call handles the commas).
    if '*' in s:
        links = [NombankTreePointer.parse(part) for part in s.split('*')]
        return NombankChainTreePointer(links)

    # Split arguments: every comma-separated piece is parsed on its own.
    if ',' in s:
        members = [NombankTreePointer.parse(part) for part in s.split(',')]
        return NombankSplitTreePointer(members)

    # Plain pointer: exactly "wordnum:height".
    fields = s.split(':')
    if len(fields) == 2:
        return NombankTreePointer(int(fields[0]), int(fields[1]))
    raise ValueError('bad nombank pointer %r' % s)
return '%s:%s' % (self.wordnum, self.height)
return 'NombankTreePointer(%d, %d)' % (self.wordnum, self.height)
while isinstance(other, (NombankChainTreePointer, NombankSplitTreePointer)): other = other.pieces[0]
if not isinstance(other, NombankTreePointer): return cmp(id(self), id(other))
return cmp( (self.wordnum, -self.height), (other.wordnum, -other.height) )
if tree is None: raise ValueError('Parse tree not avaialable') return tree[self.treepos(tree)]
""" Convert this pointer to a standard 'tree position' pointer, given that it points to the given tree. """ if tree is None: raise ValueError('Parse tree not avaialable') stack = [tree] treepos = []
wordnum = 0 while True: #print treepos #print stack[-1] # tree node: if isinstance(stack[-1], Tree): # Select the next child. if len(treepos) < len(stack): treepos.append(0) else: treepos[-1] += 1 # Update the stack. if treepos[-1] < len(stack[-1]): stack.append(stack[-1][treepos[-1]]) else: # End of node's child list: pop up a level. stack.pop() treepos.pop() # word node: else: if wordnum == self.wordnum: return tuple(treepos[:len(treepos)-self.height-1]) else: wordnum += 1 stack.pop()
|