# Natural Language Toolkit: Tokenizer Interface
#
# Copyright (C) 2001-2012 NLTK Project
# Author: Edward Loper <edloper@gradient.cis.upenn.edu>
#         Steven Bird <sb@csse.unimelb.edu.au>
# URL: <http://www.nltk.org/>
# For license information, see LICENSE.TXT

"""
Tokenizer Interface
"""

from nltk.internals import overridden
from nltk.tokenize.util import string_span_tokenize


class TokenizerI(object):
    """
    A processing interface for tokenizing a string.
    Subclasses must define ``tokenize()`` or ``batch_tokenize()`` (or both).
    """
    def tokenize(self, s):
        """
        Return a tokenized copy of *s*.

        :rtype: list(str)
        """
        # If a subclass overrode batch_tokenize() but not tokenize(),
        # fall back on the batch implementation for this single string.
        if overridden(self.batch_tokenize):
            return self.batch_tokenize([s])[0]
        else:
            raise NotImplementedError()

    def span_tokenize(self, s):
        """
        Identify the tokens using integer offsets ``(start_i, end_i)``,
        where ``s[start_i:end_i]`` is the corresponding token.

        :rtype: iter(tuple(int, int))
        """
        raise NotImplementedError()

    def batch_tokenize(self, strings):
        """
        Apply ``self.tokenize()`` to each element of ``strings``.  I.e.:

            return [self.tokenize(s) for s in strings]

        :rtype: list(list(str))
        """
        return [self.tokenize(s) for s in strings]

    def batch_span_tokenize(self, strings):
        """
        Apply ``self.span_tokenize()`` to each element of ``strings``.  I.e.:

            return [self.span_tokenize(s) for s in strings]

        :rtype: iter(list(tuple(int, int)))
        """
        # Yield one fully-realized span list per input string.
        for s in strings:
            yield list(self.span_tokenize(s))
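

# A minimal sketch of how the interface is meant to be used (illustrative
# only, not part of the original module; the class name is hypothetical).
# Defining tokenize() alone satisfies the interface; batch_tokenize() and
# batch_span_tokenize() then work through the defaults above.
class _ExampleCharTokenizer(TokenizerI):
    """Tokenize a string into its individual characters."""

    def tokenize(self, s):
        return list(s)

    def span_tokenize(self, s):
        # One (start, end) offset pair per character.
        for i in range(len(s)):
            yield (i, i + 1)

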
class StringTokenizer(TokenizerI):
    """A tokenizer that divides a string into substrings by splitting
    on the specified string (defined in subclasses).
    """
    # Subclasses must define the class attribute ``_string``, the
    # separator string to split on.

    def tokenize(self, s):
        return s.split(self._string)

    def span_tokenize(self, s):
        for span in string_span_tokenize(s, self._string):
            yield span
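

# A minimal sketch of a concrete StringTokenizer (illustrative only; the
# class name is hypothetical).  A subclass just sets ``_string``; the real
# tokenizers in nltk.tokenize.simple (e.g. SpaceTokenizer) follow this
# pattern.
class _ExampleCommaTokenizer(StringTokenizer):
    """Split a string on commas."""
    _string = ','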


if __name__ == "__main__":
    import doctest
    doctest.testmod(optionflags=doctest.NORMALIZE_WHITESPACE)
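
    # Illustrative usage of the example classes above (not part of the
    # original module): spans index back into the source string.
    s = "a,b"
    print(_ExampleCharTokenizer().tokenize(s))   # -> ['a', ',', 'b']
    print([s[start:end]
           for (start, end) in _ExampleCharTokenizer().span_tokenize(s)])
    print(_ExampleCommaTokenizer().tokenize(s))  # -> ['a', 'b']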