Coverage for nltk.corpus.reader.ieer: 35%

100

101

102

103

# Natural Language Toolkit: IEER Corpus Reader

# Author: Steven Bird <sb@csse.unimelb.edu.au>

# Edward Loper <edloper@gradient.cis.upenn.edu>

# URL: <http://www.nltk.org/>

# For license information, see LICENSE.TXT

"""

Corpus reader for the Information Extraction and Entity Recognition Corpus.

NIST 1999 Information Extraction: Entity Recognition Evaluation

http://www.itl.nist.gov/iad/894.01/tests/ie-er/er_99/er_99.htm

This corpus contains the NEWSWIRE development test data for the

NIST 1999 IE-ER Evaluation. The files were taken from the

subdirectory: /ie_er_99/english/devtest/newswire/*.ref.nwt

and filenames were shortened.

The corpus contains the following files: APW_19980314, APW_19980424,

APW_19980429, NYT_19980315, NYT_19980403, and NYT_19980407.

"""

import codecs

import nltk

from nltk import compat

from .api import *

from .util import *

#: Maps the name of each document in this corpus to a short
#: human-readable description of that document's contents.
titles = {
    'APW_19980314': 'Associated Press Weekly, 14 March 1998',
    'APW_19980424': 'Associated Press Weekly, 24 April 1998',
    'APW_19980429': 'Associated Press Weekly, 29 April 1998',
    'NYT_19980315': 'New York Times, 15 March 1998',
    'NYT_19980403': 'New York Times, 3 April 1998',
    'NYT_19980407': 'New York Times, 7 April 1998',
}

#: The names of all documents in this corpus, in sorted order.
documents = sorted(titles.keys())

class IEERDocument:
    """
    Container for one IEER document: its text together with the
    optional header fields (document number, type, date/time and
    headline).

    ``text`` (and ``headline``, when non-empty) must provide a
    ``leaves()`` method, since ``__repr__`` calls it to obtain words.
    """

    def __init__(self, text, docno=None, doctype=None,
                 date_time=None, headline=''):
        self.headline = headline
        self.date_time = date_time
        self.doctype = doctype
        self.docno = docno
        self.text = text

    def __repr__(self):
        """
        Return a short description of the document.

        The description shows the headline words when a headline is
        present; otherwise it shows the first twelve leaves of the text
        that do not start with ``'<'``, followed by ``'...'``.  The
        document number is included when it is not None.
        """
        if not self.headline:
            # No headline: summarise with the leading words of the body,
            # leaving out any leaf that begins with '<'.
            visible = [tok for tok in self.text.leaves() if tok[:1] != '<']
            summary = ' '.join(visible[:12]) + '...'
        else:
            summary = ' '.join(self.headline.leaves())

        if self.docno is None:
            return '<IEERDocument: %r>' % summary
        return '<IEERDocument %s: %r>' % (self.docno, summary)

class IEERCorpusReader(CorpusReader):
    """
    Corpus reader for the IEER corpus files.

    Each file is read as a sequence of blocks, where a block runs from
    a ``<DOC>`` line up to and including the next ``</DOC>`` line.
    """

    def raw(self, fileids=None):
        """
        Return the concatenated raw contents of the given files.

        :param fileids: A single file identifier, a list of file
            identifiers, or None to use every file of this corpus.
        """
        if fileids is None:
            fileids = self._fileids
        elif isinstance(fileids, compat.string_types):
            fileids = [fileids]

        contents = []
        for fileid in fileids:
            stream = self.open(fileid)
            # Close each stream explicitly instead of relying on garbage
            # collection to release the underlying file handle.
            try:
                contents.append(stream.read())
            finally:
                stream.close()
        return concat(contents)

    def docs(self, fileids=None):
        """
        Return the documents of the given files as unparsed strings.

        The result is the concatenation of one ``StreamBackedCorpusView``
        per file, each using ``_read_block`` as its block reader.

        :param fileids: Passed through to ``self.abspaths``.
        """
        return concat([StreamBackedCorpusView(path, self._read_block,
                                              encoding=enc)
                       for (path, enc) in self.abspaths(fileids, True)])

    def parsed_docs(self, fileids=None):
        """
        Return the documents of the given files as ``IEERDocument``
        objects.

        The result is the concatenation of one ``StreamBackedCorpusView``
        per file, each using ``_read_parsed_block`` as its block reader.

        :param fileids: Passed through to ``self.abspaths``.
        """
        return concat([StreamBackedCorpusView(path,
                                              self._read_parsed_block,
                                              encoding=enc)
                       for (path, enc) in self.abspaths(fileids, True)])

    def _read_parsed_block(self, stream):
        """
        Read one block from ``stream`` and return the parsed documents
        whose ``docno`` is not None.
        """
        # TODO: figure out why empty documents are being returned.
        # NOTE: _read_block() yields [''] when the stream holds no further
        # '<DOC>' line, which is one source of such empty documents.
        # Parse each document exactly once, then drop those without a docno.
        parsed = [self._parse(doc) for doc in self._read_block(stream)]
        return [document for document in parsed
                if document.docno is not None]

    def _parse(self, doc):
        """
        Convert one raw document string into an ``IEERDocument``.

        When ``nltk.chunk.ieerstr2tree`` returns a dict, its items are
        passed to ``IEERDocument`` as keyword arguments; any other
        return value is used as the document text.
        """
        val = nltk.chunk.ieerstr2tree(doc, top_node="DOCUMENT")
        if isinstance(val, dict):
            return IEERDocument(**val)
        else:
            return IEERDocument(val)

    def _read_block(self, stream):
        """
        Read a single document from ``stream``.

        Lines before the opening ``<DOC>`` line are discarded.  The
        ``<DOC>`` line and every following line up to and including the
        ``</DOC>`` line (or the end of the stream) are collected.

        :return: A one-element list holding the collected lines joined
            with ``'\\n'``.  If the end of the stream is reached before
            a ``<DOC>`` line is seen, that element is the empty string.
        """
        out = []

        # Skip any preamble.
        while True:
            line = stream.readline()
            if not line:
                break
            if line.strip() == '<DOC>':
                break
        # Here ``line`` is either the '<DOC>' line or '' at end of stream.
        out.append(line)

        # Read the document
        while True:
            line = stream.readline()
            if not line:
                break
            out.append(line)
            if line.strip() == '</DOC>':
                break

        # Return the document
        return ['\n'.join(out)]

Coverage for nltk.corpus.reader.ieer : 35%

51 statements 18 run 33 missing 0 excluded