Coverage for nltk.corpus.reader.knbc : 41%
![](keybd_closed.png)
Hot-keys on this page
r m x p toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
#! /usr/bin/env python # KNB Corpus reader # Copyright (C) 2001-2012 NLTK Project # Author: Masato Hagiwara <hagisan@gmail.com> # URL: <http://www.nltk.org/> # For license information, see LICENSE.TXT
# For more information, see http://lilyx.net/pages/nltkjapanesecorpus.html
# default function to convert morphlist to str for tree representation
""" This class implements: - ``__init__``, which specifies the location of the corpus and a method for detecting the sentence blocks in corpus files. - ``_read_block``, which reads a block from the input stream. - ``_word``, which takes a block and returns a list of list of words. - ``_tag``, which takes a block and returns a list of list of tagged words. - ``_parse``, which takes a block and returns a list of parsed sentences.
The structure of tagged words: tagged_word = (word(str), tags(tuple)) tags = (surface, reading, lemma, pos1, posid1, pos2, posid2, pos3, posid3, others ...) """
""" Initialize KNBCorpusReader morphs2str is a function to convert morphlist to str for tree representation for _parse() """
# blocks are split by blankline (or EOF) - default
# ignore the Bunsetsu headers
# ignore the Bunsetsu headers # convert cells to morph tuples
def _parse(self, t):
    """Parse one KNB sentence block into a dependency tree.

    :param t: the raw text of a sentence block.  Lines starting with
        ``*`` or ``+`` open a bunsetsu/tag node (e.g. ``* 1D ...``);
        lines starting with ``#`` are comments; all other lines are
        morpheme entries attached to the most recent node.
    :return: the result of ``DependencyGraph.tree()`` for the block.
    """
    dg = DependencyGraph()
    i = 0
    for line in t.splitlines():
        if line.startswith("*") or line.startswith("+"):
            # start of bunsetsu or tag
            cells = line.strip().split(" ", 3)
            # header cell looks like e.g. "1D": parent index + dep type
            m = re.match(r"([\-0-9]*)([ADIP])", cells[1])
            assert m is not None

            dep_parent = int(m.group(1))

            # Grow the node list to cover both this node and its parent
            # BEFORE indexing into it.  (The original indexed
            # dg.nodelist[i] first, which raises IndexError whenever the
            # node was not already created by an earlier forward
            # dependency.)
            while len(dg.nodelist) < i + 1 or len(dg.nodelist) < dep_parent + 1:
                dg.nodelist.append({'word': [], 'deps': []})

            node = dg.nodelist[i]
            node['address'] = i
            node['rel'] = m.group(2)  # dep_type: one of A/D/I/P
            node['word'] = []

            if dep_parent == -1:
                # -1 marks the sentence root.
                dg.root = node
            else:
                dg.nodelist[dep_parent]['deps'].append(i)

            i += 1
        elif not line.startswith("#"):
            # normal morph line: surface form, then the analysis fields
            cells = line.strip().split(" ")
            # convert cells to a morph tuple: (surface, rest-of-analysis)
            morph = (cells[0], ' '.join(cells[1:]))
            # attach to the node opened by the latest "*"/"+" header
            dg.nodelist[i - 1]['word'].append(morph)

    # Optionally collapse each node's morph list to a display string
    # for the tree representation.
    if self.morphs2str:
        for node in dg.nodelist:
            node['word'] = self.morphs2str(node['word'])

    return dg.tree()
###################################################################### # Demo ######################################################################
def demo():
    """Demonstrate the KNB corpus reader: fileids, words, parses, tags."""
    import nltk
    from nltk.corpus.util import LazyCorpusLoader

    corpus_root = nltk.data.find('corpora/knbc/corpus1')
    pointer = FileSystemPathPointer(corpus_root)
    fileids = [
        f for f in find_corpus_fileids(pointer, ".*")
        if re.search(r"\d\-\d\-[\d]+\-[\d]+", f)
    ]

    def _knbc_fileids_sort(x):
        # Order the hyphen-separated ids numerically on the last three fields.
        cells = x.split('-')
        return (cells[0], int(cells[1]), int(cells[2]), int(cells[3]))

    knbc = LazyCorpusLoader(
        'knbc/corpus1',
        KNBCorpusReader,
        sorted(fileids, key=_knbc_fileids_sort),
        encoding='euc-jp',
    )

    print(knbc.fileids()[:10])
    print(''.join(knbc.words()[:100]))

    print('\n\n'.join('%s' % tree for tree in knbc.parsed_sents()[:2]))

    # Swap in a morphs2str that renders each morph as surface(pos).
    knbc.morphs2str = lambda morphs: '/'.join(
        "%s(%s)" % (m[0], m[1].split(' ')[2]) for m in morphs if m[0] != 'EOS'
    ).encode('utf-8')

    print('\n\n'.join('%s' % tree for tree in knbc.parsed_sents()[:2]))

    print('\n'.join(
        ' '.join("%s/%s" % (w[0], w[1].split(' ')[2]) for w in sent)
        for sent in knbc.tagged_sents()[0:2]
    ))
def test():
    """Smoke-test the reader: words and tagged words come back typed."""
    from nltk.corpus.util import LazyCorpusLoader

    reader = LazyCorpusLoader(
        'knbc/corpus1', KNBCorpusReader, r'.*/KN.*', encoding='euc-jp')

    assert isinstance(reader.words()[0], compat.string_types)
    assert isinstance(reader.sents()[0][0], compat.string_types)
    assert isinstance(reader.tagged_words()[0], tuple)
    assert isinstance(reader.tagged_sents()[0][0], tuple)
if __name__ == '__main__':
    demo()
    # test()