Coverage for nltk.corpus.reader.dependency : 66%
![keyboard icon](keybd_closed.png)
Hot-keys on this page
r m x p toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
# Natural Language Toolkit: Dependency Corpus Reader
#
# Copyright (C) 2001-2012 NLTK Project
# Author: Kepa Sarasola <kepa.sarasola@ehu.es>
#         Iker Manterola <returntothehangar@hotmail.com>
#
# URL: <http://www.nltk.org/>
# For license information, see LICENSE.TXT
word_tokenizer=TabTokenizer(), sent_tokenizer=RegexpTokenizer('\n', gaps=True), para_block_reader=read_blankline_block):
#########################################################
""" :return: the given file(s) as a single string. :rtype: str """ return concat([open(fileid).read() for fileid in self.abspaths(fileids)])
return concat([DependencyCorpusView(fileid, False, False, False) for fileid in self.abspaths(fileids)])
return concat([DependencyCorpusView(fileid, True, False, False) for fileid in self.abspaths(fileids)])
return concat([DependencyCorpusView(fileid, False, True, False) for fileid in self.abspaths(fileids)])
return concat([DependencyCorpusView(fileid, True, True, False) for fileid in self.abspaths(fileids)])
for fileid in self.abspaths(fileids)])
chunk_types=None):
# Read the next sentence. # Strip off the docstart marker, if present. sent = sent[len(self._DOCSTART):].lstrip()
# extract word and tag from any of the formats lines = [line.split('\t') for line in sent.split('\n')] if len(lines[0]) == 3 or len(lines[0]) == 4: sent = [(line[0], line[1]) for line in lines] elif len(lines[0]) == 10: sent = [(line[1], line[4]) for line in lines] else: raise ValueError('Unexpected number of fields in dependency tree file')
# discard tags if they weren't requested if not self._tagged: sent = [word for (word, tag) in sent]
# Return the result. else: return list(sent) |