
# Natural Language Toolkit: Dependency Corpus Reader
#
# Copyright (C) 2001-2012 NLTK Project
# Author: Kepa Sarasola <kepa.sarasola@ehu.es>
#         Iker Manterola <returntothehangar@hotmail.com>
#
# URL: <http://www.nltk.org/>
# For license information, see LICENSE.TXT

from nltk.parse import DependencyGraph
from nltk.tokenize import *

from .util import *
from .api import *


class DependencyCorpusReader(SyntaxCorpusReader):

    def __init__(self, root, fileids, encoding=None,
                 word_tokenizer=TabTokenizer(),
                 sent_tokenizer=RegexpTokenizer('\n', gaps=True),
                 para_block_reader=read_blankline_block):
        # Note: the tokenizer and block-reader arguments are accepted but not
        # stored here; DependencyCorpusView below does its own line/tab splitting.
        CorpusReader.__init__(self, root, fileids, encoding)

 

    #########################################################

    def raw(self, fileids=None):
        """
        :return: the given file(s) as a single string.
        :rtype: str
        """
        return concat([open(fileid).read()
                       for fileid in self.abspaths(fileids)])

    def words(self, fileids=None):
        """
        :return: the given file(s) as a list of words.
        :rtype: list(str)
        """
        return concat([DependencyCorpusView(fileid, False, False, False)
                       for fileid in self.abspaths(fileids)])

    def tagged_words(self, fileids=None):
        """
        :return: the given file(s) as a list of (word, tag) tuples.
        :rtype: list(tuple(str, str))
        """
        return concat([DependencyCorpusView(fileid, True, False, False)
                       for fileid in self.abspaths(fileids)])

    def sents(self, fileids=None):
        """
        :return: the given file(s) as a list of sentences, each a list of words.
        :rtype: list(list(str))
        """
        return concat([DependencyCorpusView(fileid, False, True, False)
                       for fileid in self.abspaths(fileids)])

    def tagged_sents(self, fileids=None):
        """
        :return: the given file(s) as a list of sentences, each a list of
            (word, tag) tuples.
        :rtype: list(list(tuple(str, str)))
        """
        return concat([DependencyCorpusView(fileid, True, True, False)
                       for fileid in self.abspaths(fileids)])

    def parsed_sents(self, fileids=None):
        """
        :return: the given file(s) as a list of DependencyGraph objects,
            one per sentence.
        :rtype: list(DependencyGraph)
        """
        sents = concat([DependencyCorpusView(fileid, False, True, True)
                        for fileid in self.abspaths(fileids)])
        return [DependencyGraph(sent) for sent in sents]
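# A minimal usage sketch (not part of the module itself).  The corpus root and
# file name below are hypothetical -- point them at any tab-separated dependency
# corpus with blank lines between sentences:
#
#     reader = DependencyCorpusReader('/path/to/corpus', ['sample.conll'])
#     reader.words()[:10]         # flat list of word tokens
#     reader.tagged_sents()[0]    # first sentence as (word, tag) pairs
#     reader.parsed_sents()[0]    # DependencyGraph for the first sentence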

 

 

class DependencyCorpusView(StreamBackedCorpusView):
    _DOCSTART = '-DOCSTART- -DOCSTART- O\n'  # marks the start of a document

    def __init__(self, corpus_file, tagged, group_by_sent, dependencies,
                 chunk_types=None):
        self._tagged = tagged
        self._dependencies = dependencies
        self._group_by_sent = group_by_sent
        self._chunk_types = chunk_types
        StreamBackedCorpusView.__init__(self, corpus_file)

    def read_block(self, stream):
        # Read the next sentence.
        sent = read_blankline_block(stream)[0].strip()
        # Strip off the docstart marker, if present.
        if sent.startswith(self._DOCSTART):
            sent = sent[len(self._DOCSTART):].lstrip()

        # Extract the word and tag from any of the supported column formats.
        if not self._dependencies:
            lines = [line.split('\t') for line in sent.split('\n')]
            if len(lines[0]) in (3, 4):
                sent = [(line[0], line[1]) for line in lines]
            elif len(lines[0]) == 10:
                sent = [(line[1], line[4]) for line in lines]
            else:
                raise ValueError('Unexpected number of fields in dependency tree file')

            # Discard the tags if they weren't requested.
            if not self._tagged:
                sent = [word for (word, tag) in sent]

        # Return the result.
        if self._group_by_sent:
            return [sent]
        else:
            return list(sent)
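# Illustration of the column formats read_block() accepts (a sketch; the token
# values below are made up).  Fields within a line are tab-separated and
# sentences are separated by blank lines.  In the 10-field CoNLL-style layout
# the word and tag come from fields 2 and 5 (indices 1 and 4):
#
#     1  Pierre  Pierre  NNP  NNP  _  2  NMOD  _  _   ->  ('Pierre', 'NNP')
#
# In the 3- or 4-field layout they are the first two fields of each line:
#
#     Pierre  NNP  2  NMOD                            ->  ('Pierre', 'NNP')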