Coverage for nltk.corpus.reader.tagged : 79%
![](keybd_closed.png)
Hot-keys on this page
r m x p toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
# Natural Language Toolkit: Tagged Corpus Reader
#
# Copyright (C) 2001-2012 NLTK Project
# Author: Edward Loper <edloper@gradient.cis.upenn.edu>
#         Steven Bird <sb@ldc.upenn.edu>
#         Jacob Perkins <japerk@gmail.com>
# URL: <http://www.nltk.org/>
# For license information, see LICENSE.TXT
"""
A reader for corpora whose documents contain part-of-speech-tagged words.
"""
""" Reader for simple part-of-speech tagged corpora. Paragraphs are assumed to be split using blank lines. Sentences and words can be tokenized using the default tokenizers, or by custom tokenizers specified as parameters to the constructor. Words are parsed using ``nltk.tag.str2tuple``. By default, ``'/'`` is used as the separator. I.e., words should have the form::
word1/tag1 word2/tag2 word3/tag3 ...
But custom separators may be specified as parameters to the constructor. Part of speech tags are case-normalized to upper case. """ sep='/', word_tokenizer=WhitespaceTokenizer(), sent_tokenizer=RegexpTokenizer('\n', gaps=True), para_block_reader=read_blankline_block, encoding=None, tag_mapping_function=None): """ Construct a new Tagged Corpus reader for a set of documents located at the given root directory. Example usage:
>>> root = '/...path to corpus.../'
>>> reader = TaggedCorpusReader(root, '.*', '.txt')
:param root: The root directory for this corpus. :param fileids: A list or regexp specifying the fileids in this corpus. """
""" :return: the given file(s) as a single string. :rtype: str """ if fileids is None: fileids = self._fileids elif isinstance(fileids, compat.string_types): fileids = [fileids] return concat([self.open(f).read() for f in fileids])
""" :return: the given file(s) as a list of words and punctuation symbols. :rtype: list(str) """ False, False, False, self._sep, self._word_tokenizer, self._sent_tokenizer, self._para_block_reader, None) for (fileid, enc) in self.abspaths(fileids, True)])
""" :return: the given file(s) as a list of sentences or utterances, each encoded as a list of word strings. :rtype: list(list(str)) """ False, True, False, self._sep, self._word_tokenizer, self._sent_tokenizer, self._para_block_reader, None) for (fileid, enc) in self.abspaths(fileids, True)])
""" :return: the given file(s) as a list of paragraphs, each encoded as a list of sentences, which are in turn encoded as lists of word strings. :rtype: list(list(list(str))) """ return concat([TaggedCorpusView(fileid, enc, False, True, True, self._sep, self._word_tokenizer, self._sent_tokenizer, self._para_block_reader, None) for (fileid, enc) in self.abspaths(fileids, True)])
""" :return: the given file(s) as a list of tagged words and punctuation symbols, encoded as tuples ``(word,tag)``. :rtype: list(tuple(str,str)) """ if simplify_tags: tag_mapping_function = self._tag_mapping_function else: tag_mapping_function = None return concat([TaggedCorpusView(fileid, enc, True, False, False, self._sep, self._word_tokenizer, self._sent_tokenizer, self._para_block_reader, tag_mapping_function) for (fileid, enc) in self.abspaths(fileids, True)])
""" :return: the given file(s) as a list of sentences, each encoded as a list of ``(word,tag)`` tuples.
:rtype: list(list(tuple(str,str))) """ tag_mapping_function = self._tag_mapping_function else: True, True, False, self._sep, self._word_tokenizer, self._sent_tokenizer, self._para_block_reader, tag_mapping_function) for (fileid, enc) in self.abspaths(fileids, True)])
""" :return: the given file(s) as a list of paragraphs, each encoded as a list of sentences, which are in turn encoded as lists of ``(word,tag)`` tuples. :rtype: list(list(list(tuple(str,str)))) """ if simplify_tags: tag_mapping_function = self._tag_mapping_function else: tag_mapping_function = None return concat([TaggedCorpusView(fileid, enc, True, True, True, self._sep, self._word_tokenizer, self._sent_tokenizer, self._para_block_reader, tag_mapping_function) for (fileid, enc) in self.abspaths(fileids, True)])
TaggedCorpusReader): """ A reader for part-of-speech tagged corpora whose documents are divided into categories based on their file identifiers. """ """ Initialize the corpus reader. Categorization arguments (``cat_pattern``, ``cat_map``, and ``cat_file``) are passed to the ``CategorizedCorpusReader`` constructor. The remaining arguments are passed to the ``TaggedCorpusReader``. """
raise ValueError('Specify fileids or categories, not both') else: return TaggedCorpusReader.raw( self, self._resolve(fileids, categories)) self, self._resolve(fileids, categories)) self, self._resolve(fileids, categories)) return TaggedCorpusReader.paras( self, self._resolve(fileids, categories)) return TaggedCorpusReader.tagged_words( self, self._resolve(fileids, categories), simplify_tags) self, self._resolve(fileids, categories), simplify_tags) return TaggedCorpusReader.tagged_paras( self, self._resolve(fileids, categories), simplify_tags)
""" A specialized corpus view for tagged documents. It can be customized via flags to divide the tagged corpus documents up by sentence or paragraph, and to include or omit part of speech tags. ``TaggedCorpusView`` objects are typically created by ``TaggedCorpusReader`` (not directly by nltk users). """ group_by_para, sep, word_tokenizer, sent_tokenizer, para_block_reader, tag_mapping_function=None):
"""Reads one paragraph at a time.""" self._word_tokenizer.tokenize(sent_str)] sent = [(w, self._tag_mapping_function(t)) for (w,t) in sent] else: block.append(para) else:
# needs to implement simplified tags """ A corpus reader for the MAC_MORPHO corpus. Each line contains a single tagged word, using '_' as a separator. Sentence boundaries are based on the end-sentence tag ('_.'). Paragraph information is not included in the corpus, so each paragraph returned by ``self.paras()`` and ``self.tagged_paras()`` contains a single sentence. """ self, root, fileids, sep='_', word_tokenizer=LineTokenizer(), sent_tokenizer=RegexpTokenizer('.*\n'), para_block_reader=self._read_block, encoding=encoding, tag_mapping_function=tag_mapping_function)
""" A corpus reader for tagged sentences that are included in the TIMIT corpus. """ self, para_block_reader=read_timit_block, *args, **kwargs)
raise NotImplementedError('use sents() instead')
raise NotImplementedError('use tagged_sents() instead') |