Coverage for nltk.corpus.reader.conll : 22%
![](keybd_closed.png)
Hot-keys on this page
r m x p toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
# Natural Language Toolkit: CONLL Corpus Reader # # Copyright (C) 2001-2012 NLTK Project # Author: Steven Bird <sb@ldc.upenn.edu> # Edward Loper <edloper@gradient.cis.upenn.edu> # URL: <http://www.nltk.org/> # For license information, see LICENSE.TXT
Read CoNLL-style chunk fileids. """
""" A corpus reader for CoNLL-style files. These files consist of a series of sentences, separated by blank lines. Each sentence is encoded using a table (or "grid") of values, where each line corresponds to a single word, and each column corresponds to an annotation type. The set of columns used by CoNLL-style files can vary from corpus to corpus; the ``ConllCorpusReader`` constructor therefore takes an argument, ``columntypes``, which is used to specify the columns that are used by a given corpus.
@todo: Add support for reading from corpora where different parallel files contain different columns. @todo: Possibly add caching of the grid corpus view? This would allow the same grid view to be used by different data access methods (e.g., words() and parsed_sents() could both share the same grid corpus view object). @todo: Better support for -DOCSTART-. Currently, we just ignore it, but it could be used to define methods that retrieve a document at a time (e.g., parsed_documents()). """
#///////////////////////////////////////////////////////////////// # Column Types #/////////////////////////////////////////////////////////////////
#: A list of all column types supported by the conll corpus reader.
#///////////////////////////////////////////////////////////////// # Constructor #/////////////////////////////////////////////////////////////////
chunk_types=None, top_node='S', pos_in_tree=False, srl_includes_roleset=True, encoding=None, tree_class=Tree, tag_mapping_function=None): raise ValueError('Bad column type %r' % columntype) chunk_types = [chunk_types]
#///////////////////////////////////////////////////////////////// # Data Access Methods #/////////////////////////////////////////////////////////////////
if fileids is None: fileids = self._fileids elif isinstance(fileids, compat.string_types): fileids = [fileids] return concat([self.open(f).read() for f in fileids])
self._require(self.WORDS) return LazyConcatenation(LazyMap(self._get_words, self._grids(fileids)))
self._require(self.WORDS) return LazyMap(self._get_words, self._grids(fileids))
self._require(self.WORDS, self.POS) def get_tagged_words(grid): return self._get_tagged_words(grid, simplify_tags) return LazyConcatenation(LazyMap(get_tagged_words, self._grids(fileids)))
self._require(self.WORDS, self.POS) def get_tagged_words(grid): return self._get_tagged_words(grid, simplify_tags) return LazyMap(get_tagged_words, self._grids(fileids))
simplify_tags=False): self._require(self.WORDS, self.POS, self.CHUNK) if chunk_types is None: chunk_types = self._chunk_types def get_chunked_words(grid): # capture chunk_types as local var return self._get_chunked_words(grid, chunk_types, simplify_tags) return LazyConcatenation(LazyMap(get_chunked_words, self._grids(fileids)))
simplify_tags=False): self._require(self.WORDS, self.POS, self.CHUNK) if chunk_types is None: chunk_types = self._chunk_types def get_chunked_words(grid): # capture chunk_types as local var return self._get_chunked_words(grid, chunk_types, simplify_tags) return LazyMap(get_chunked_words, self._grids(fileids))
self._require(self.WORDS, self.POS, self.TREE) if pos_in_tree is None: pos_in_tree = self._pos_in_tree def get_parsed_sent(grid): # capture pos_in_tree as local var return self._get_parsed_sent(grid, pos_in_tree, simplify_tags) return LazyMap(get_parsed_sent, self._grids(fileids))
self._require(self.SRL) return LazyMap(self._get_srl_spans, self._grids(fileids))
self._require(self.WORDS, self.POS, self.TREE, self.SRL) if pos_in_tree is None: pos_in_tree = self._pos_in_tree def get_srl_instances(grid): # capture pos_in_tree as local var return self._get_srl_instances(grid, pos_in_tree) result = LazyMap(get_srl_instances, self._grids(fileids)) if flatten: result = LazyConcatenation(result) return result
""" :return: a list of word/tag/IOB tuples :rtype: list(tuple) :param fileids: the list of fileids that make up this corpus :type fileids: None or str or list """ self._require(self.WORDS, self.POS, self.CHUNK) def get_iob_words(grid): return self._get_iob_words(grid, simplify_tags) return LazyConcatenation(LazyMap(get_iob_words, self._grids(fileids)))
""" :return: a list of lists of word/tag/IOB tuples :rtype: list(list) :param fileids: the list of fileids that make up this corpus :type fileids: None or str or list """ self._require(self.WORDS, self.POS, self.CHUNK) def get_iob_words(grid): return self._get_iob_words(grid, simplify_tags) return LazyMap(get_iob_words, self._grids(fileids))
#///////////////////////////////////////////////////////////////// # Grid Reading #/////////////////////////////////////////////////////////////////
# n.b.: we could cache the object returned here (keyed on # fileids), which would let us reuse the same corpus view for # different things (eg srl and parse trees). return concat([StreamBackedCorpusView(fileid, self._read_grid_block, encoding=enc) for (fileid, enc) in self.abspaths(fileids, True)])
grids = [] for block in read_blankline_block(stream): block = block.strip() if not block: continue
grid = [line.split() for line in block.split('\n')]
# If there's a docstart row, then discard. ([xx] eventually it # would be good to actually use it) if grid[0][self._colmap.get('words', 0)] == '-DOCSTART-': del grid[0]
# Check that the grid is consistent. for row in grid: if len(row) != len(grid[0]): raise ValueError('Inconsistent number of columns:\n%s' % block) grids.append(grid) return grids
#///////////////////////////////////////////////////////////////// # Transforms #///////////////////////////////////////////////////////////////// # given a grid, transform it into some representation (e.g., # a list of words or a parse tree).
return self._get_column(grid, self._colmap['words'])
pos_tags = self._get_column(grid, self._colmap['pos']) if simplify_tags: pos_tags = [self._tag_mapping_function(t) for t in pos_tags] return zip(self._get_column(grid, self._colmap['words']), pos_tags)
pos_tags = self._get_column(grid, self._colmap['pos']) if simplify_tags: pos_tags = [self._tag_mapping_function(t) for t in pos_tags] return zip(self._get_column(grid, self._colmap['words']), pos_tags, self._get_column(grid, self._colmap['chunk']))
# n.b.: this method is very similar to conllstr2tree. words = self._get_column(grid, self._colmap['words']) pos_tags = self._get_column(grid, self._colmap['pos']) if simplify_tags: pos_tags = [self._tag_mapping_function(t) for t in pos_tags] chunk_tags = self._get_column(grid, self._colmap['chunk'])
stack = [Tree(self._top_node, [])]
for (word, pos_tag, chunk_tag) in zip(words, pos_tags, chunk_tags): if chunk_tag == 'O': state, chunk_type = 'O', '' else: (state, chunk_type) = chunk_tag.split('-') # If it's a chunk we don't care about, treat it as O. if chunk_types is not None and chunk_type not in chunk_types: state = 'O' # Treat a mismatching I like a B. if state == 'I' and chunk_type != stack[-1].node: state = 'B' # For B or I: close any open chunks if state in 'BO' and len(stack) == 2: stack.pop() # For B: start a new chunk. if state == 'B': new_chunk = Tree(chunk_type, []) stack[-1].append(new_chunk) stack.append(new_chunk) # Add the word token. stack[-1].append((word, pos_tag))
return stack[0]
words = self._get_column(grid, self._colmap['words']) pos_tags = self._get_column(grid, self._colmap['pos']) if simplify_tags: pos_tags = [self._tag_mapping_function(t) for t in pos_tags] parse_tags = self._get_column(grid, self._colmap['tree'])
treestr = '' for (word, pos_tag, parse_tag) in zip(words, pos_tags, parse_tags): if word == '(': word = '-LRB-' if word == ')': word = '-RRB-' if pos_tag == '(': pos_tag = '-LRB-' if pos_tag == ')': pos_tag = '-RRB-' (left, right) = parse_tag.split('*') right = right.count(')')*')' # only keep ')'. treestr += '%s (%s %s) %s' % (left, pos_tag, word, right) try: tree = self._tree_class.parse(treestr) except (ValueError, IndexError): tree = self._tree_class.parse('(%s %s)' % (self._top_node, treestr))
if not pos_in_tree: for subtree in tree.subtrees(): for i, child in enumerate(subtree): if (isinstance(child, Tree) and len(child)==1 and isinstance(child[0], compat.string_types)): subtree[i] = (child[0], child.node)
return tree
""" list of list of (start, end), tag) tuples """ if self._srl_includes_roleset: predicates = self._get_column(grid, self._colmap['srl']+1) start_col = self._colmap['srl']+2 else: predicates = self._get_column(grid, self._colmap['srl']) start_col = self._colmap['srl']+1
# Count how many predicates there are. This tells us how many # columns to expect for SRL data. num_preds = len([p for p in predicates if p != '-'])
spanlists = [] for i in range(num_preds): col = self._get_column(grid, start_col+i) spanlist = [] stack = [] for wordnum, srl_tag in enumerate(col): (left, right) = srl_tag.split('*') for tag in left.split('('): if tag: stack.append((tag, wordnum)) for i in range(right.count(')')): (tag, start) = stack.pop() spanlist.append( ((start, wordnum+1), tag) ) spanlists.append(spanlist)
return spanlists
tree = self._get_parsed_sent(grid, pos_in_tree) spanlists = self._get_srl_spans(grid) if self._srl_includes_roleset: predicates = self._get_column(grid, self._colmap['srl']+1) rolesets = self._get_column(grid, self._colmap['srl']) else: predicates = self._get_column(grid, self._colmap['srl']) rolesets = [None] * len(predicates)
instances = ConllSRLInstanceList(tree) for wordnum, predicate in enumerate(predicates): if predicate == '-': continue # Decide which spanlist to use. Don't assume that they're # sorted in the same order as the predicates (even though # they usually are). for spanlist in spanlists: for (start, end), tag in spanlist: if wordnum in range(start,end) and tag in ('V', 'C-V'): break else: continue break else: raise ValueError('No srl column found for %r' % predicate) instances.append(ConllSRLInstance(tree, wordnum, predicate, rolesets[wordnum], spanlist))
return instances
#///////////////////////////////////////////////////////////////// # Helper Methods #/////////////////////////////////////////////////////////////////
for columntype in columntypes: if columntype not in self._colmap: raise ValueError('This corpus does not contain a %s ' 'column.' % columntype)
def _get_column(grid, column_index): return [grid[i][column_index] for i in range(len(grid))]
""" An SRL instance from a CoNLL corpus, which identifies and provides labels for the arguments of a single verb. """ # [xx] add inst.core_arguments, inst.argm_arguments?
self.verb = [] """A list of the word indices of the words that compose the verb whose arguments are identified by this instance. This will contain multiple word indices when multi-word verbs are used (e.g. 'turn on')."""
self.verb_head = verb_head """The word index of the head word of the verb whose arguments are identified by this instance. E.g., for a sentence that uses the verb 'turn on,' ``verb_head`` will be the word index of the word 'turn'."""
self.verb_stem = verb_stem
self.roleset = roleset
self.arguments = [] """A list of ``(argspan, argid)`` tuples, specifying the location and type for each of the arguments identified by this instance. ``argspan`` is a tuple ``start, end``, indicating that the argument consists of the ``words[start:end]``."""
self.tagged_spans = tagged_spans """A list of ``(span, id)`` tuples, specifying the location and type for each of the arguments, as well as the verb pieces, that make up this instance."""
self.tree = tree """The parse tree for the sentence containing this instance."""
self.words = tree.leaves() """A list of the words in the sentence containing this instance."""
# Fill in the self.verb and self.arguments values. for (start, end), tag in tagged_spans: if tag in ('V', 'C-V'): self.verb += list(range(start, end)) else: self.arguments.append( ((start, end), tag) )
plural = len(self.arguments)!=1 and 's' or '' return '<ConllSRLInstance for %r with %d argument%s>' % ( (self.verb_stem, len(self.arguments), plural))
verbstr = ' '.join(self.words[i][0] for i in self.verb) hdr = 'SRL for %r (stem=%r):\n' % (verbstr, self.verb_stem) s = '' for i, word in enumerate(self.words): if isinstance(word, tuple): word = word[0] for (start, end), argid in self.arguments: if i == start: s += '[%s ' % argid if i == end: s += '] ' if i in self.verb: word = '<<%s>>' % word s += word + ' ' return hdr + textwrap.fill(s.replace(' ]', ']'), initial_indent=' ', subsequent_indent=' ')
""" Set of instances for a single sentence """ self.tree = tree list.__init__(self, instances)
return self.pprint()
# Sanity check: trees should be the same for inst in self: if inst.tree != self.tree: raise ValueError('Tree mismatch!')
# If desired, add trees: if include_tree: words = self.tree.leaves() pos = [None] * len(words) synt = ['*'] * len(words) self._tree2conll(self.tree, 0, words, pos, synt)
s = '' for i in range(len(words)): # optional tree columns if include_tree: s += '%-20s ' % words[i] s += '%-8s ' % pos[i] s += '%15s*%-8s ' % tuple(synt[i].split('*'))
# verb head column for inst in self: if i == inst.verb_head: s += '%-20s ' % inst.verb_stem break else: s += '%-20s ' % '-' # Remaining columns: self for inst in self: argstr = '*' for (start, end), argid in inst.tagged_spans: if i==start: argstr = '(%s%s' % (argid, argstr) if i==(end-1): argstr += ')' s += '%-12s ' % argstr s += '\n' return s
assert isinstance(tree, Tree) if len(tree) == 1 and isinstance(tree[0], compat.string_types): pos[wordnum] = tree.node assert words[wordnum] == tree[0] return wordnum+1 elif len(tree) == 1 and isinstance(tree[0], tuple): assert len(tree[0]) == 2 pos[wordnum], pos[wordnum] = tree[0] return wordnum+1 else: synt[wordnum] = '(%s%s' % (tree.node, synt[wordnum]) for child in tree: wordnum = self._tree2conll(child, wordnum, words, pos, synt) synt[wordnum-1] += ')' return wordnum
""" A ConllCorpusReader whose data file contains three columns: words, pos, and chunk. """ tag_mapping_function=None): self, root, fileids, ('words', 'pos', 'chunk'), chunk_types=chunk_types, encoding=encoding, tag_mapping_function=tag_mapping_function)
|