Coverage for nltk.corpus.reader.chasen : 63%
![](keybd_closed.png)
Hot-keys on this page:
  r, m, x, p — toggle line displays
  j / k      — next / previous highlighted chunk
  0 (zero)   — top of page
  1 (one)    — first highlighted chunk
#
# Copyright (C) 2001-2012 NLTK Project
# Author: Masato Hagiwara <hagisan@gmail.com>
# URL: <http://www.nltk.org/>
# For license information, see LICENSE.TXT
#
# For more information, see http://lilyx.net/pages/nltkjapanesecorpus.html
if fileids is None: fileids = self._fileids elif isinstance(fileids, compat.string_types): fileids = [fileids] return concat([self.open(f).read() for f in fileids])
return concat([ChasenCorpusView(fileid, enc, False, False, False, self._sent_splitter) for (fileid, enc) in self.abspaths(fileids, True)])
True, False, False, self._sent_splitter) for (fileid, enc) in self.abspaths(fileids, True)])
return concat([ChasenCorpusView(fileid, enc, False, True, False, self._sent_splitter) for (fileid, enc) in self.abspaths(fileids, True)])
return concat([ChasenCorpusView(fileid, enc, True, True, False, self._sent_splitter) for (fileid, enc) in self.abspaths(fileids, True)])
return concat([ChasenCorpusView(fileid, enc, False, True, True, self._sent_splitter) for (fileid, enc) in self.abspaths(fileids, True)])
return concat([ChasenCorpusView(fileid, enc, True, True, True, self._sent_splitter) for (fileid, enc) in self.abspaths(fileids, True)])
""" A specialized corpus view for ChasenReader. Similar to ``TaggedCorpusView``, but this'll use fixed sets of word and sentence tokenizer. """
tagged, group_by_sent, group_by_para, sent_splitter=None):
"""Reads one paragraph at a time."""
if not self._tagged: sent = [w for (w,t) in sent] if self._group_by_sent: para.append(sent) else: para.extend(sent) sent = []
sent = [w for (w,t) in sent]
para.append(sent) else:
block.append(para) else:
import nltk from nltk.corpus.util import LazyCorpusLoader
jeita = LazyCorpusLoader( 'jeita', ChasenCorpusReader, r'.*chasen', encoding='utf-8') print('/'.join( jeita.words()[22100:22140] ))
print('\nEOS\n'.join(['\n'.join("%s/%s" % (w[0],w[1].split('\t')[2]) for w in sent) for sent in jeita.tagged_sents()[2170:2173]]))
from nltk.corpus.util import LazyCorpusLoader
jeita = LazyCorpusLoader( 'jeita', ChasenCorpusReader, r'.*chasen', encoding='utf-8')
assert isinstance(jeita.tagged_words()[0][1], compat.string_types)
demo() test() |