Hot-keys on this page
r m x p toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
# Natural Language Toolkit: Aligned Corpus Reader # # Copyright (C) 2001-2012 NLTK Project # URL: <http://www.nltk.org/> # Author: Steven Bird <sb@csse.unimelb.edu.au> # For license information, see LICENSE.TXT
read_alignedsent_block
""" Reader for corpora of word-aligned sentences. Tokens are assumed to be separated by whitespace. Sentences begin on separate lines. """ sep='/', word_tokenizer=WhitespaceTokenizer(), sent_tokenizer=RegexpTokenizer('\n', gaps=True), alignedsent_block_reader=read_alignedsent_block, encoding=None): """ Construct a new Aligned Corpus reader for a set of documents located at the given root directory. Example usage:
>>> root = '/...path to corpus.../' >>> reader = AlignedCorpusReader(root, '.*')
:param root: The root directory for this corpus. :param fileids: A list or regexp specifying the fileids in this corpus. """
""" :return: the given file(s) as a single string. :rtype: str """ if fileids is None: fileids = self._fileids elif isinstance(fileids, compat.string_types): fileids = [fileids] return concat([self.open(f).read() for f in fileids])
""" :return: the given file(s) as a list of words and punctuation symbols. :rtype: list(str) """ self._word_tokenizer, self._sent_tokenizer, self._alignedsent_block_reader) for (fileid, enc) in self.abspaths(fileids, True)])
""" :return: the given file(s) as a list of sentences or utterances, each encoded as a list of word strings. :rtype: list(list(str)) """ return concat([AlignedSentCorpusView(fileid, enc, False, True, self._word_tokenizer, self._sent_tokenizer, self._alignedsent_block_reader) for (fileid, enc) in self.abspaths(fileids, True)])
""" :return: the given file(s) as a list of AlignedSent objects. :rtype: list(AlignedSent) """ self._word_tokenizer, self._sent_tokenizer, self._alignedsent_block_reader) for (fileid, enc) in self.abspaths(fileids, True)])
""" A specialized corpus view for aligned sentences. ``AlignedSentCorpusView`` objects are typically created by ``AlignedCorpusReader`` (not directly by NLTK users). """ word_tokenizer, sent_tokenizer, alignedsent_block_reader):
for alignedsent_str in self._alignedsent_block_reader(stream) for sent_str in self._sent_tokenizer.tokenize(alignedsent_str)] block = [block[0]] else:
|