Coverage for nltk.corpus.reader.verbnet : 31%
![](keybd_closed.png)
Hot-keys on this page:
- `r`, `m`, `x`, `p`: toggle line displays
- `j`, `k`: next/prev highlighted chunk
- `0` (zero): top of page
- `1` (one): first highlighted chunk
# Natural Language Toolkit: Verbnet Corpus Reader
#
# Copyright (C) 2001-2012 NLTK Project
# Author: Edward Loper <edloper@gradient.cis.upenn.edu>
# URL: <http://www.nltk.org/>
# For license information, see LICENSE.TXT
# No unicode encoding param, since the data files are all XML.
"""A dictionary mapping from verb lemma strings to lists of verbnet class identifiers."""
"""A dictionary mapping from wordnet identifier strings to lists of verbnet class identifiers."""
"""A dictionary mapping from class identifiers to corresponding file identifiers. The keys of this dictionary provide a complete list of all classes and subclasses."""
# Initialize the dictionaries. Use the quick (regexp-based)
# method instead of the slow (xml-based) method, because it
# runs 2-30 times faster.
"""Regular expression that matches (and decomposes) longids"""
"""Regular expression that matches shortids"""
r'<VNSUBCLASS ID="([^"]+)"/?>') """Regular expression used by ``_index()`` to quickly scan the corpus for basic information."""
""" Return a list of all verb lemmas that appear in any class, or in the ``classid`` if specified. """ if classid is None: return sorted(self._lemma_to_class.keys()) else: # [xx] should this include subclass members? vnclass = self.vnclass(classid) return [member.get('name') for member in vnclass.findall('MEMBERS/MEMBER')]
""" Return a list of all wordnet identifiers that appear in any class, or in ``classid`` if specified. """ if classid is None: return sorted(self._wordnet_to_class.keys()) else: # [xx] should this include subclass members? vnclass = self.vnclass(classid) return sum([member.get('wn','').split() for member in vnclass.findall('MEMBERS/MEMBER')], [])
""" Return a list of the verbnet class identifiers. If a file identifier is specified, then return only the verbnet class identifiers for classes (and subclasses) defined by that file. If a lemma is specified, then return only verbnet class identifiers for classes that contain that lemma as a member. If a wordnetid is specified, then return only identifiers for classes that contain that wordnetid as a member. If a classid is specified, then return only identifiers for subclasses of the specified verbnet class. """ if len([x for x in [lemma, wordnetid, fileid, classid] if x is not None]) > 1: raise ValueError('Specify at most one of: fileid, wordnetid, ' 'fileid, classid') if fileid is not None: return [c for (c,f) in self._class_to_fileid.items() if f == fileid] elif lemma is not None: return self._lemma_to_class[lemma] elif wordnetid is not None: return self._wordnet_to_class[wordnetid] elif classid is not None: xmltree = self.vnclass(classid) return [subclass.get('ID') for subclass in xmltree.findall('SUBCLASSES/VNSUBCLASS')] else: return sorted(self._class_to_fileid.keys())
""" Return an ElementTree containing the xml for the specified verbnet class.
:param fileid_or_classid: An identifier specifying which class should be returned. Can be a file identifier (such as ``'put-9.1.xml'``), or a verbnet class identifier (such as ``'put-9.1'``) or a short verbnet class identifier (such as ``'9.1'``). """ # File identifier: just return the xml. if fileid_or_classid in self._fileids: return self.xml(fileid_or_classid)
# Class identifier: get the xml, and find the right elt. classid = self.longid(fileid_or_classid) if classid in self._class_to_fileid: fileid = self._class_to_fileid[self.longid(classid)] tree = self.xml(fileid) if classid == tree.get('ID'): return tree else: for subclass in tree.findall('.//VNSUBCLASS'): if classid == subclass.get('ID'): return subclass else: assert False # we saw it during _index()!
else: raise ValueError('Unknown identifier %s' % fileid_or_classid)
""" Return a list of fileids that make up this corpus. If ``vnclass_ids`` is specified, then return the fileids that make up the specified verbnet class(es). """ if vnclass_ids is None: return self._fileids elif isinstance(vnclass_ids, compat.string_types): return [self._class_to_fileid[self.longid(vnclass_ids)]] else: return [self._class_to_fileid[self.longid(vnclass_id)] for vnclass_id in vnclass_ids]
######################################################################
#{ Index Initialization
######################################################################
""" Initialize the indexes ``_lemma_to_class``, ``_wordnet_to_class``, and ``_class_to_fileid`` by scanning through the corpus fileids. This is fast with cElementTree (<0.1 secs), but quite slow (>10 secs) with the python implementation of ElementTree. """ for fileid in self._fileids: self._index_helper(self.xml(fileid), fileid)
"""Helper for ``_index()``""" vnclass = xmltree.get('ID') self._class_to_fileid[vnclass] = fileid self._shortid_to_longid[self.shortid(vnclass)] = vnclass for member in xmltree.findall('MEMBERS/MEMBER'): self._lemma_to_class[member.get('name')].append(vnclass) for wn in member.get('wn', '').split(): self._wordnet_to_class[wn].append(vnclass) for subclass in xmltree.findall('SUBCLASSES/VNSUBCLASS'): self._index_helper(subclass, fileid)
""" Initialize the indexes ``_lemma_to_class``, ``_wordnet_to_class``, and ``_class_to_fileid`` by scanning through the corpus fileids. This doesn't do proper xml parsing, but is good enough to find everything in the standard verbnet corpus -- and it runs about 30 times faster than xml parsing (with the python ElementTree; only 2-3 times faster with cElementTree). """ # nb: if we got rid of wordnet_to_class, this would run 2-3 # times faster. else: assert False, 'unexpected match condition'
######################################################################
#{ Identifier conversion
######################################################################
"""Given a short verbnet class identifier (eg '37.10'), map it to a long id (eg 'confess-37.10'). If ``shortid`` is already a long id, then return it as-is""" if self._LONGID_RE.match(shortid): return shortid # it's already a longid. elif not self._SHORTID_RE.match(shortid): raise ValueError('vnclass identifier %r not found' % shortid) try: return self._shortid_to_longid[shortid] except KeyError: raise ValueError('vnclass identifier %r not found' % shortid)
"""Given a long verbnet class identifier (eg 'confess-37.10'), map it to a short id (eg '37.10'). If ``longid`` is already a short id, then return it as-is.""" return longid # it's already a shortid. else: raise ValueError('vnclass identifier %r not found' % longid)
######################################################################
#{ Pretty Printing
######################################################################
""" Return a string containing a pretty-printed representation of the given verbnet class.
:param vnclass: A verbnet class identifier; or an ElementTree containing the xml contents of a verbnet class. """ if isinstance(vnclass, compat.string_types): vnclass = self.vnclass(vnclass)
s = vnclass.get('ID') + '\n' s += self.pprint_subclasses(vnclass, indent=' ') + '\n' s += self.pprint_members(vnclass, indent=' ') + '\n' s += ' Thematic roles:\n' s += self.pprint_themroles(vnclass, indent=' ') + '\n' s += ' Frames:\n' s += '\n'.join(self.pprint_frame(vnframe, indent=' ') for vnframe in vnclass.findall('FRAMES/FRAME')) return s
""" Return a string containing a pretty-printed representation of the given verbnet class's subclasses.
:param vnclass: A verbnet class identifier; or an ElementTree containing the xml contents of a verbnet class. """ if isinstance(vnclass, compat.string_types): vnclass = self.vnclass(vnclass)
subclasses = [subclass.get('ID') for subclass in vnclass.findall('SUBCLASSES/VNSUBCLASS')] if not subclasses: subclasses = ['(none)'] s = 'Subclasses: ' + ' '.join(subclasses) return textwrap.fill(s, 70, initial_indent=indent, subsequent_indent=indent+' ')
""" Return a string containing a pretty-printed representation of the given verbnet class's member verbs.
:param vnclass: A verbnet class identifier; or an ElementTree containing the xml contents of a verbnet class. """ if isinstance(vnclass, compat.string_types): vnclass = self.vnclass(vnclass)
members = [member.get('name') for member in vnclass.findall('MEMBERS/MEMBER')] if not members: members = ['(none)'] s = 'Members: ' + ' '.join(members) return textwrap.fill(s, 70, initial_indent=indent, subsequent_indent=indent+' ')
""" Return a string containing a pretty-printed representation of the given verbnet class's thematic roles.
:param vnclass: A verbnet class identifier; or an ElementTree containing the xml contents of a verbnet class. """ if isinstance(vnclass, compat.string_types): vnclass = self.vnclass(vnclass)
pieces = [] for themrole in vnclass.findall('THEMROLES/THEMROLE'): piece = indent + '* ' + themrole.get('type') modifiers = ['%(Value)s%(type)s' % restr.attrib for restr in themrole.findall('SELRESTRS/SELRESTR')] if modifiers: piece += '[%s]' % ' '.join(modifiers) pieces.append(piece)
return '\n'.join(pieces)
""" Return a string containing a pretty-printed representation of the given verbnet frame.
:param vnframe: An ElementTree containing the xml contents of a verbnet frame. """ s = self.pprint_description(vnframe, indent) + '\n' s += self.pprint_syntax(vnframe, indent+' Syntax: ') + '\n' s += indent + ' Semantics:\n' s += self.pprint_semantics(vnframe, indent+' ') return s
""" Return a string containing a pretty-printed representation of the given verbnet frame description.
:param vnframe: An ElementTree containing the xml contents of a verbnet frame. """ descr = vnframe.find('DESCRIPTION') s = indent + descr.attrib['primary'] if descr.get('secondary', ''): s += ' (%s)' % descr.get('secondary') return s
""" Return a string containing a pretty-printed representation of the given verbnet frame syntax.
:param vnframe: An ElementTree containing the xml contents of a verbnet frame. """ pieces = [] for elt in vnframe.find('SYNTAX'): piece = elt.tag modifiers = [] if 'value' in elt.attrib: modifiers.append(elt.get('value')) modifiers += ['%(Value)s%(type)s' % restr.attrib for restr in (elt.findall('SELRESTRS/SELRESTR') + elt.findall('SYNRESTRS/SYNRESTR'))] if modifiers: piece += '[%s]' % ' '.join(modifiers) pieces.append(piece)
return indent + ' '.join(pieces)
""" Return a string containing a pretty-printed representation of the given verbnet frame semantics.
:param vnframe: An ElementTree containing the xml contents of a verbnet frame. """ pieces = [] for pred in vnframe.findall('SEMANTICS/PRED'): args = [arg.get('value') for arg in pred.findall('ARGS/ARG')] pieces.append('%s(%s)' % (pred.get('value'), ', '.join(args))) return '\n'.join(['%s* %s' % (indent, piece) for piece in pieces])
|