Coverage for nltk.corpus.reader.senseval: 17%

100

101

102

103

# Natural Language Toolkit: Senseval 2 Corpus Reader

# Author: Trevor Cohn <tacohn@cs.mu.oz.au>

# Steven Bird <sb@csse.unimelb.edu.au> (modifications)

# URL: <http://www.nltk.org/>

# For license information, see LICENSE.TXT

"""

Read from the Senseval 2 Corpus.

SENSEVAL [http://www.senseval.org/]

Evaluation exercises for Word Sense Disambiguation.

Organized by ACL-SIGLEX [http://www.siglex.org/]

Prepared by Ted Pedersen <tpederse@umn.edu>, University of Minnesota,

http://www.d.umn.edu/~tpederse/data.html

Distributed with permission.

The NLTK version of the Senseval 2 files uses well-formed XML.

Each instance of the ambiguous words "hard", "interest", "line", and "serve"

is tagged with a sense identifier, and supplied with context.

"""

from __future__ import print_function

import re

from xml.etree import ElementTree

from nltk import compat

from nltk.tokenize import *

from .util import *

from .api import *

class SensevalInstance(object):
    """
    A single sense-tagged occurrence of an ambiguous word.

    Attributes:
      - ``word``: the value passed as ``word`` (the corpus view in this
        module passes the name of the enclosing lexical element).
      - ``position``: the value passed as ``position`` (the corpus view
        passes the index of the head word within ``context``).
      - ``context``: the value passed as ``context``, stored as given.
      - ``senses``: the sense identifiers, stored as a tuple.
    """

    def __init__(self, word, position, context, senses):
        """
        :param word: the ambiguous word this instance is about.
        :param position: index of the head word in ``context``.
        :param context: the context tokens; stored without copying.
        :param senses: any iterable of sense identifiers; it is
            materialised into a tuple.
        """
        self.word = word
        self.senses = tuple(senses)
        self.position = position
        self.context = context

    def __repr__(self):
        return ('SensevalInstance(word=%r, position=%r, '
                'context=%r, senses=%r)' %
                (self.word, self.position, self.context, self.senses))

class SensevalCorpusReader(CorpusReader):
    """
    Corpus reader for the Senseval 2 files.  ``instances()`` exposes the
    sense-tagged instances; ``raw()`` exposes the unparsed file contents.
    """

    def instances(self, fileids=None):
        """
        :return: the concatenation of one ``SensevalCorpusView`` per
            requested file (each view is built from the file's path and
            encoding as reported by ``self.abspaths``).
        """
        return concat([SensevalCorpusView(fileid, enc)
                       for (fileid, enc) in self.abspaths(fileids, True)])

    def raw(self, fileids=None):
        """
        :param fileids: a single file identifier, a list of them, or
            ``None`` for every file known to this reader.
        :return: the text contents of the given fileids, as a single string.
        """
        if fileids is None:
            fileids = self._fileids
        elif isinstance(fileids, compat.string_types):
            fileids = [fileids]

        contents = []
        for f in fileids:
            stream = self.open(f)
            # Close every stream we open, even if read() fails, so that
            # reading many files does not leak file handles.
            try:
                contents.append(stream.read())
            finally:
                stream.close()
        return concat(contents)

    def _entry(self, tree):
        """
        Collect one ``(sense, context)`` pair per ``<instance>`` found
        under the ``<lexelt>`` children of ``tree``.

        ``sense`` is the ``senseid`` attribute of the instance's first
        child; ``context`` is a list of ``(text, pos)`` pairs, one per
        child of the instance's second child.

        NOTE(review): nothing in this module calls this helper.
        """
        elts = []
        for lexelt in tree.findall('lexelt'):
            for inst in lexelt.findall('instance'):
                sense = inst[0].attrib['senseid']
                context = [(w.text, w.attrib['pos'])
                           for w in inst[1]]
                elts.append((sense, context))
        return elts

class SensevalCorpusView(StreamBackedCorpusView):
    """
    Stream-backed view over a Senseval file.  Each block read from the
    stream is a one-element list holding the ``SensevalInstance`` built
    from the next ``<instance>`` element.
    """

    def __init__(self, fileid, encoding):
        StreamBackedCorpusView.__init__(self, fileid, encoding=encoding)

        self._word_tokenizer = WhitespaceTokenizer()
        # Parallel lists: _lexelts[i] is the name of the lexical element
        # in effect from stream position _lexelt_starts[i] onwards.  The
        # initial (0, None) entry covers text before the first <lexelt>.
        self._lexelt_starts = [0] # list of streampos
        self._lexelts = [None] # list of lexelt names

    def read_block(self, stream):
        """
        Read lines up to and including the next ``</instance>`` line.

        :return: a list containing the single parsed ``SensevalInstance``,
            or an empty list if the end of the stream is reached first.
        """
        # Decide which lexical element we're in.
        # NOTE(review): ``bisect`` is not imported explicitly in this
        # module; presumably it arrives via one of the star imports --
        # confirm.
        lexelt_num = bisect.bisect_right(self._lexelt_starts, stream.tell())-1
        lexelt = self._lexelts[lexelt_num]

        instance_lines = []
        in_instance = False
        while True:
            line = stream.readline()
            if line == '':
                # End of stream: we must not be part-way through an
                # instance.
                assert instance_lines == []
                return []

            # Start of a lexical element?
            if line.lstrip().startswith('<lexelt'):
                lexelt_num += 1
                m = re.search('item=("[^"]+"|\'[^\']+\')', line)
                assert m is not None # <lexelt> has no 'item=...'
                # Strip the surrounding quote characters.
                lexelt = m.group(1)[1:-1]
                if lexelt_num < len(self._lexelts):
                    # Seen on an earlier pass; it must not have changed.
                    assert lexelt == self._lexelts[lexelt_num]
                else:
                    self._lexelts.append(lexelt)
                    self._lexelt_starts.append(stream.tell())

            # Start of an instance?
            if line.lstrip().startswith('<instance'):
                assert instance_lines == []
                in_instance = True

            # Body of an instance?
            if in_instance:
                instance_lines.append(line)

            # End of an instance?
            if line.lstrip().startswith('</instance'):
                xml_block = '\n'.join(instance_lines)
                xml_block = _fixXML(xml_block)
                inst = ElementTree.fromstring(xml_block)
                return [self._parse_instance(inst, lexelt)]

    def _parse_instance(self, instance, lexelt):
        """
        Build a ``SensevalInstance`` from a parsed ``<instance>`` element.

        :param instance: the element whose children are ``<answer>`` and
            ``<context>`` elements.
        :param lexelt: the lexical element name, stored as the
            instance's ``word``.
        :return: a ``SensevalInstance`` whose context mixes plain string
            tokens (from free text) with ``(text, pos)`` tuples (from
            ``<wf>`` elements), and whose position is the index of the
            ``<head>`` token in that context (``None`` if there is no
            ``<head>``).
        """
        senses = []
        context = []
        position = None
        for child in instance:
            if child.tag == 'answer':
                senses.append(child.attrib['senseid'])
            elif child.tag == 'context':
                # ``text`` is None when the context opens directly with
                # a child element, so only tokenize real text.
                if child.text:
                    context += self._word_tokenizer.tokenize(child.text)
                for cword in child:
                    if cword.tag == 'compound':
                        # NOTE(review): after this rebinding the tail
                        # handled below is that of the first child, not
                        # of <compound> itself, and any further children
                        # of <compound> are not visited -- confirm this
                        # is intended.
                        cword = cword[0] # is this ok to do?

                    if cword.tag == 'head':
                        # ``text`` is None when <head> holds only a
                        # child element; treat that as empty text.
                        head_text = (cword.text or '').strip()
                        # Some sanity checks: a head has either text or
                        # exactly one child element, never both.
                        assert position is None, 'head specified twice'
                        assert head_text or len(cword)==1
                        assert not (head_text and len(cword)==1)
                        # Record the position of the head:
                        position = len(context)
                        # Add on the head word itself:
                        if head_text:
                            context.append(head_text)
                        elif cword[0].tag == 'wf':
                            context.append((cword[0].text,
                                            cword[0].attrib['pos']))
                            if cword[0].tail:
                                context += self._word_tokenizer.tokenize(
                                    cword[0].tail)
                        else:
                            assert False, 'expected CDATA or wf in <head>'
                    elif cword.tag == 'wf':
                        context.append((cword.text, cword.attrib['pos']))
                    elif cword.tag == 's':
                        pass # Sentence boundary marker.

                    else:
                        print('ACK', cword.tag)
                        assert False, 'expected CDATA or <wf> or <head>'
                    # Free text following the element belongs to the
                    # context as well.
                    if cword.tail:
                        context += self._word_tokenizer.tokenize(cword.tail)
            else:
                assert False, 'unexpected tag %s' % child.tag
        return SensevalInstance(lexelt, position, context, senses)

def _fixXML(text):

"""

Fix the various issues with Senseval pseudo-XML.

"""

# <~> or <^> => ~ or ^

text = re.sub(r'<([~\^])>', r'\1', text)

# fix lone &

text = re.sub(r'(\s+)\&(\s+)', r'\1&\2', text)

# fix """

text = re.sub(r'"""', '\'"\'', text)

# fix <s snum=dd> => <s snum="dd"/>

text = re.sub(r'(<[^<]*snum=)([^">]+)>', r'\1"\2"/>', text)

# fix foreign word tag

text = re.sub(r'<\&frasl>\s*<p[^>]*>', 'FRASL', text)

# remove <&I .>

text = re.sub(r'<\&I[^>]*>', '', text)

# fix <{word}>

text = re.sub(r'<{([^}]+)}>', r'\1', text)

# remove <@>, <p>, </p>

text = re.sub(r'<(@|/?p)>', r'', text)

# remove <&M .> and <&T .> and <&Ms .>

text = re.sub(r'<&\w+ \.>', r'', text)

# remove <!DOCTYPE... > lines

text = re.sub(r'<!DOCTYPE[^>]*>', r'', text)

# remove <[hi]> and <[/p]> etc

text = re.sub(r'<\[\/?[^>]+\]*>', r'', text)

# take the thing out of the brackets: <…>

text = re.sub(r'<(\&\w+;)>', r'\1', text)

# and remove the & for those patterns that aren't regular XML

text = re.sub(r'&(?!amp|gt|lt|apos|quot)', r'', text)

# fix 'abc <p="foo"/>' style tags - now <wf pos="foo">abc</wf>

text = re.sub(r'[ \t]*([^<>\s]+?)[ \t]*<p="([^"]*"?)"/>',

r' <wf pos="\2">\1</wf>', text)

text = re.sub(r'\s*"\s*<p=\'"\'/>', " <wf pos='\"'>\"</wf>", text)

return text

Coverage for nltk.corpus.reader.senseval : 17%

117 statements 20 run 97 missing 0 excluded