Coverage for nltk.sourcedstring : 84%
![](keybd_closed.png)
Hot-keys on this page
r m x p toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
# Natural Language Toolkit: Sourced Strings # # Copyright (C) 2001-2009 NLTK Project # Author: Edward Loper <edloper@gmail.com> # URL: <http://www.nltk.org/> # For license information, see LICENSE.TXT
"Sourced strings" are strings that are annotated with information about the location in a document where they were originally found. Sourced strings are subclassed from Python strings. As a result, they can usually be used anywhere a normal Python string can be used.
>>> from nltk.sourcedstring import SourcedString >>> newt_contents = ''' ... She turned me into a newt! ... I got better.'''.strip() >>> newt_doc = SourcedString(newt_contents, 'newt.txt') >>> newt_doc 'She turned me into a newt!\\nI got better.'@[0:40] >>> newt = newt_doc.split()[5] # Find the sixth word. >>> newt 'newt!'@[21:26]
""" #from __future__ import unicode_literals
'StringSource', 'ConsecutiveCharStringSource', 'ContiguousCharStringSource', 'SourcedString', 'SourcedStringStream', 'SourcedStringRegexp', 'SimpleSourcedString', 'CompoundSourcedString', 'SimpleSourcedByteString', 'SimpleSourcedUnicodeString', 'CompoundSourcedByteString', 'CompoundSourcedUnicodeString', ]
#////////////////////////////////////////////////////////////////////// # String Sources #//////////////////////////////////////////////////////////////////////
""" A description of the location of a string in a document. Each ``StringSource`` consists of a document identifier, along with information about the begin and end offsets of each character in the string. These offsets are typically either byte offsets or character offsets. (Note that for unicode strings, byte offsets and character offsets are not the same thing.)
``StringSource`` is an abstract base class. Two concrete subclasses are used depending on the properties of the string whose source is being described:
- ``ConsecutiveCharStringSource`` describes the source of strings whose characters have consecutive offsets (in particular, byte strings w/ byte offsets; and unicode strings with character offsets).
- ``ContiguousCharStringSource`` describes the source of strings whose characters are contiguous, but do not necessarily have consecutive offsets (in particular, unicode strings with byte offsets).
:ivar docid: An identifier (such as a filename) that specifies which document contains the string.
:ivar offsets: A list of offsets specifying the location of each character in the document. The *i* th character of the string begins at offset ``offsets[i]`` and ends at offset ``offsets[i+1]``. The length of the ``offsets`` list is one greater than the length of the string described by this ``StringSource``.
:ivar begin: The document offset where the string begins. (I.e., the offset of the first character in the string.) ``source.begin`` is always equal to ``source.offsets[0]``.
:ivar end: The document offset where the string ends. (For character offsets, one plus the offset of the last character; for byte offsets, one plus the offset of the last byte that encodes the last character). ``source.end`` is always equal to ``source.offsets[-1]``. """
# If the StringSource constructor is called directly, then # choose one of its subclasses to delegate to. raise TypeError("Specifcy either begin and end, or " "offsets, using keyword arguments") 'offsets' in kwargs): else: raise TypeError("Specify either begin and end, or offsets " "(but not both)") # Construct the object.
""" Create a new ``StringSource``. When the ``StringSource`` constructor is called directly, it automatically delegates to one of its two subclasses:
- If ``begin`` and ``end`` are specified, then a ``ConsecutiveCharStringSource`` is returned. - If ``offsets`` is specified, then a ``ContiguousCharStringSource`` is returned.
In both cases, the arguments must be specified as keyword arguments (not positional arguments). """
""" Return a ``StringSource`` describing the location where the specified character was found. In particular, if ``s`` is the string that this source describes, then return a ``StringSource`` describing the location of ``s[index]``.
:raise IndexError: If index is out of range. """ else:
def __getslice__(self, start, stop): """ Return a ``StringSource`` describing the location where the specified substring was found. In particular, if ``s`` is the string that this source describes, then return a ``StringSource`` describing the location of ``s[start:stop]``. """
def __len__(self): """ Return the length of the string described by this ``StringSource``. Note that this may not be equal to ``self.end-self.begin`` for unicode strings described using byte offsets. """
else:
return (cmp(self.docid, self.docid) or cmp([(charloc.begin, charloc.end) for charloc in self], [(charloc.begin, charloc.end) for charloc in other]))
# Cache hash values. tuple((charloc.begin, charloc.end) for charloc in self)) )
""" A ``StringSource`` that specifies the source of strings whose characters have consecutive offsets. In particular, the following two properties must hold for all valid indices:
- source[i].end == source[i].begin + 1 - source[i].end == source[i+1].begin
These properties allow the source to be stored using just a start offset and an end offset (along with a docid).
This ``StringSource`` can be used to describe byte strings that are indexed using byte offsets or character offsets; or unicode strings that are indexed using character offsets. """
def offsets(self):
self.docid, self.begin+start, self.begin+stop)
cmp(self.begin, other.begin) or cmp(self.end, other.end)) else: return StringSource.__cmp__(self, other)
self.docid, self.begin, self.end)
""" A ``StringSource`` that specifies the source of strings whose characters are contiguous, but do not necessarily have consecutive offsets. In particular, each character's end offset must be equal to the next character's start offset:
- source[i].end == source[i+1].begin
This property allows the source to be stored using a list of ``len(source)+1`` offsets (along with a docid).
This ``StringSource`` can be used to describe unicode strings that are indexed using byte offsets. """ raise ValueError("at least one offset must be specified")
self.docid, self.offsets[start:stop+1])
if isinstance(other, ConsecutiveCharStringSource): return (cmp(self.docid, other.docid) or cmp(self.offsets, other._offsets)) else: return StringSource.__cmp__(self, other)
#////////////////////////////////////////////////////////////////////// # Base Class for Sourced Strings. #//////////////////////////////////////////////////////////////////////
""" A string that is annotated with information about the location in a document where it was originally found. Sourced strings are subclassed from Python strings. As a result, they can usually be used anywhere a normal Python string can be used.
There are two types of sourced strings: ``SimpleSourcedString``s, which correspond to a single substring of a document; and ``CompoundSourcedString``s, which are constructed by concatenating strings from multiple sources. Each of these types has two concrete subclasses: one for unicode strings (subclassed from ``unicode``), and one for byte strings (subclassed from ``str``).
Two sourced strings are considered equal if their contents are equal, even if their sources differ. This fact is important in ensuring that sourced strings act like normal strings. In particular, it allows sourced strings to be used with code that was originally intended to process plain Python strings.
If you wish to determine whether two sourced strings came from the same location in the same document, simply compare their ``sources`` attributes. If you know that both sourced strings are ``SimpleSourcedStrings``, then you can compare their ``source`` attribute instead.
String operations that act on sourced strings will preserve location information whenever possible. However, there are a few types of string manipulation that can cause source information to be discarded. The most common examples of operations that will lose source information are:
- ``str.join()``, where the joining string is not sourced. - ``str.replace()``, where the original string is not sourced. - String formatting (the ``%`` operator). - Regular expression substitution.
:ivar sources: A sorted tuple of ``(index, source)`` pairs. Each such pair specifies that the source of ``self[index:index+len(source)]`` is ``source``. Any characters for which no source is specified are sourceless (e.g., plain Python characters that were concatenated to a sourced string).
When working with simple sourced strings, it's usually easier to use the ``source`` attribute instead; however, the ``sources`` attribute is defined for both simple and compound sourced strings. """ # If the SourcedString constructor is called directly, then # choose one of its subclasses to delegate to. else: "string or a byte string")
# Create the new object using the appropriate string class's # __new__, which takes just the contents argument.
"""A class variable, defined by subclasses of ``SourcedString``, determining what type of string this class contains. Its value must be either str or ``unicode``."""
#////////////////////////////////////////////////////////////////////// #{ Splitting & Stripping Methods #//////////////////////////////////////////////////////////////////////
# Check for unicode/bytestring mismatches: # Use a regexp to split self.
# Check for unicode/bytestring mismatches: # Split on whitespace: use a regexp. # Split on a given string: use rfind. else:
if keepends: return self._LINE_RE.findall(self) else: return self._NEWLINE_RE.split(self)
#////////////////////////////////////////////////////////////////////// #{ String Concatenation Methods #//////////////////////////////////////////////////////////////////////
def concat(substrings): """ Return a sourced string formed by concatenating the given list of substrings. Adjacent substrings will be merged when possible.
Depending on the types and values of the supplied substrings, the concatenated string's value may be a Python string (str or ``unicode``), a ``SimpleSourcedString``, or a ``CompoundSourcedString``. """ # Flatten nested compound sourced strings, and merge adjacent # strings where possible:
# Return the concatenated string. return '' else:
return self._stringtype('') else:
return self.__mul__(other)
# Add the first element; but if sequence is empty, return an # empty string. except StopIteration: return self._stringtype('')
# Add the remaining elements, separated by self.
def __add_substring_to_list(substring, result): """ Helper for ``concat()``: add ``substring`` to the end of the list of substrings in ``result``. If ``substring`` is compound, then add its own substrings instead. Merge adjacent substrings whenever possible. Discard empty un-sourced substrings. """ # Flatten nested compound sourced strings.
# Discard empty Python substrings.
# Merge adjacent simple sourced strings (when possible). isinstance(substring, SimpleSourcedString) and result[-1].end == substring.begin and result[-1].docid == substring.docid): result[-1], substring)
# Merge adjacent Python strings. not isinstance(substring, SourcedString)):
# All other strings just get appended to the result list. else:
def __merge_simple_substrings(lhs, rhs): """ Helper for ``__add_substring_to_list()``: Merge ``lhs`` and ``rhs`` into a single simple sourced string, and return it. """ isinstance(rhs.source, ConsecutiveCharStringSource)): lhs.source.docid, lhs.source.begin, rhs.source.end) else: lhs.source.docid, lhs.source.offsets+rhs.source.offsets[1:])
#////////////////////////////////////////////////////////////////////// #{ Justification Methods #//////////////////////////////////////////////////////////////////////
fillchar * ((width-len(self)+1)/2))
return self.rjust(width, '0')
#////////////////////////////////////////////////////////////////////// #{ Replacement Methods #//////////////////////////////////////////////////////////////////////
# [xx] There's no reason in principle why this can't preserve # location information. But for now, it doesn't. return self._stringtype.__mod__(self, other)
# Check for unicode/bytestring mismatches: # Use a regexp to find all occurrences of old, and replace them w/ new.
if len(self) == 0: return self pieces = re.split(r'([\t\n])', self) result = '' offset = 0 for piece in pieces: if piece == '\t': spaces = 8 - (offset % tabsize) # Each inserted space's source is the same as the # source of the tab character that generated it. result += spaces * SourcedString(' ', piece.source) offset = 0 else: result += piece if piece == '\n': offset = 0 else: offset += len(piece) return result
# Note: str.translate() and unicode.translate() have # different interfaces. raise TypeError('The unicode version of translate() does not ' 'accept the deletechars parameter') [SourcedString(table.get(c,c), c.source) for c in self if table.get(c,c) is not None]) else: raise ValueError('translation table must be 256 characters long') [SourcedString(table[ord(c)], c.source) for c in self if c not in deletechars])
#////////////////////////////////////////////////////////////////////// #{ Unicode #//////////////////////////////////////////////////////////////////////
# Unicode string -> byte string return self.decode().encode(encoding, errors)
# Encode characters one at a time. result.append(SourcedString(char_byte, char.source)) else:
# Byte string -> unicode string.
# Decode self into a plain unicode string.
# Special case: if the resulting string has the same length # that the source string does, then we can safely assume that # each character is encoded with one byte; so we can just # reuse our source.
# Otherwise: re-encode the characters, one at a time, to # determine how long their encodings are. isinstance(self[last_byte], SourcedString)): begin=begin, end=end) else: offsets=[begin, end]) else: result.append(unicode_char)
# First byte of the next char is 1+last byte of this char. raise AssertionError("SourcedString.decode() does not support " "encodings that are not symmetric.")
def _decode_one_to_one(unicode_chars): """ Helper for ``self.decode()``. Returns a unicode-decoded version of this ``SourcedString``. ``unicode_chars`` is the unicode-decoded contents of this ``SourcedString``.
This is used in the special case where the decoded string has the same length that the source string does. As a result, we can safely assume that each character is encoded with one byte; so we can just reuse our source. E.g., this will happen when decoding an ASCII string with utf-8. """
""" Return true if the list (self,)+args contains at least one unicode string and at least one byte string. (If this is the case, then all byte strings should be converted to unicode by calling decode() before the operation is performed. You can do this automatically using ``_decode_and_call()``.) """
""" If self or any of the values in args is a byte string, then convert it to unicode by calling its decode() method. Then return the result of calling self.op(*args). ``op`` is specified using a string, because if ``self`` is a byte string, then it will change type when it is decoded. """ # Make sure all args are decoded to unicode. # Make sure self is decoded to unicode. # Retry the operation.
#////////////////////////////////////////////////////////////////////// #{ Display #//////////////////////////////////////////////////////////////////////
""" Return a string containing a pretty-printed display of this sourced string.
:param vertical: If true, then the returned display string will have vertical orientation, rather than the default horizontal orientation.
:param wrap: Controls when the pretty-printed output is wrapped to the next line. If ``wrap`` is an integer, then lines are wrapped when they become longer than ``wrap``. If ``wrap`` is a string, then lines are wrapped immediately following that string. If ``wrap`` is None, then lines are never wrapped. """
getattr(c, 'end', 0)) for c in self))) raise TypeError("Expected wrap to be a sring, int, or None.")
# If the docid changed, then display the docid for the # previous segment.
# Put a cap on the beginning of sourceless strings
# Display the character.
# Decide whether we're at the end of the line or not. self[max(0,pos-len(wrap)+1):pos+1] == wrap) or (isinstance(wrap, (integer_types)) and line_len>=wrap) or pos == len(self)-1):
# Put a cap on the end of sourceless strings
# Filter out any empty output lines.
# Draw the docid line
# Draw the output lines
# Reset variables for the next line.
result = [] prev_offset = None max_digits = len(str(max(max(getattr(c, 'begin', 0), getattr(c, 'end', 0)) for c in self))) for pos, char in enumerate(self): char_begin = getattr(char, 'begin', None) char_end = getattr(char, 'end', None) char_docid = getattr(char, 'docid', None)
if char_begin is None: assert char_end is None if pos == 0: result.append('+-----+') result.append(':%s:' % self._pprint_char_repr(char).center(5)) if pos == len(self)-1: result.append('+-----+') prev_offset = None else: if char_begin != prev_offset: result.append('+-----+ %s [%s]' % ( str(char_begin).rjust(max_digits), char_docid)) result.append('|%s| %s [%s]' % ( self._pprint_char_repr(char).center(5), ' '*max_digits, char_docid)) result.append('+-----+ %s [%s]' % ( str(char_end).rjust(max_digits), char_docid)) prev_offset = char_end return '\n'.join(result)
'\a': r'\a', '\t': r'\t'}
# Decide how to represent this character. return r'\x%02x' % ord(char) else:
"""Helper for ``pprint()``: add a character to the pretty-printed output.""" # Add fillers to the offset lines.
"""Helper for ``pprint()``: add an offset marker to the pretty-printed output."""
#////////////////////////////////////////////////////////////////////// # Simple Sourced String #//////////////////////////////////////////////////////////////////////
""" A single substring of a document, annotated with information about the location in the document where it was originally found. See ``SourcedString`` for more information. """ # If the SimpleSourcedString constructor is called directly, # then choose one of its subclasses to delegate to. if cls is SimpleSourcedString: if isinstance(contents, binary_type): cls = SimpleSourcedByteString elif isinstance(contents, text_type): cls = SimpleSourcedUnicodeString else: raise TypeError("Expected 'contents' to be a unicode " "string or a byte string")
# Create the new object using the appropriate string class's # __new__, which takes just the contents argument. return cls._stringtype.__new__(cls, contents)
""" Construct a new sourced string.
:param contents: The string contents of the new sourced string. :type contents: str or unicode :param source: The source for the new string. If ``source`` is a string, then it is used to automatically construct a new ``ConsecutiveCharStringSource`` with a begin offset of ``0`` and an end offset of ``len(contents)``. Otherwise, ``source`` should be a ``StringSource`` whose length matches the length of ``contents``. """ raise ValueError("Length of source (%d) must match length of " "contents (%d)" % (len(source), len(contents)))
occurred in the source document."""
def begin(self): """ The document offset where the string begins. (I.e., the offset of the first character in the string.)"""
def end(self): """The document offset where the string ends. (For character offsets, one plus the offset of the last character; for byte offsets, one plus the offset of the last byte that encodes the last character)."""
def docid(self): """ An identifier (such as a filename) that specifies the document where the string was found. """
def sources(self):
else:
else: else:
# Negative indices get handled *before* __getslice__ is # called. Restrict start/stop to be within the range of the # string, to prevent negative indices from being adjusted # twice.
self._stringtype.__getslice__(self, start, stop), self.source[start:stop])
#////////////////////////////////////////////////////////////////////// # Compound Sourced String #//////////////////////////////////////////////////////////////////////
""" A string constructed by concatenating substrings from multiple sources, and annotated with information about the locations where those substrings were originally found. See ``SourcedString`` for more information.
:ivar substrings: The tuple of substrings that compose this compound sourced string. Every compound sourced string is required to have at least two substrings; and the substrings themselves may never be CompoundSourcedStrings. """ # If the CompoundSourcedString constructor is called directly, # then choose one of its subclasses to delegate to. # Decide whether to use a unicode string or a byte string. if isinstance(substring, text_type)) else:
# Build the concatenated string using str.join(), which will # return a str or unicode object; never a sourced string.
# Create the new object using the appropriate string class's # __new__, which takes just the contents argument.
""" Construct a new compound sourced string that combines the given list of substrings.
Typically, compound sourced strings should not be constructed directly; instead, use ``SourcedString.concat()``, which flattens nested compound sourced strings, and merges adjacent substrings when possible.
:raise ValueError: If ``len(substrings) < 2`` :raise ValueError: If ``substrings`` contains any ``CompoundSourcedString``s. """ raise ValueError("CompoundSourcedString requires at least " "two substrings")
# Don't nest compound sourced strings. raise ValueError("substrings may not contain " "CompoundSourcedStrings.")
def sources(self):
else:
if index.step not in (None, 1): return self._stringtype.__getitem__(self, index) else: start, stop = slice_bounds(self, index) return self.__getslice__(start, stop) else:
# Bounds checking.
# Construct a source list for the resulting string.
# Concatenate the resulting substrings. return '' else:
return SourcedString.concat([s.capitalize() for s in self.substrings])
return SourcedString.concat([s.lower() for s in self.substrings])
return SourcedString.concat([s.upper() for s in self.substrings])
return SourcedString.concat([s.swapcase() for s in self.substrings])
return SourcedString.concat([s.encode(encoding, errors) for s in self.substrings])
index = 0 result = [] for substring in self.substrings: decoded_substring = unicode_chars[index:index+len(substring)] if isinstance(substring, SourcedString): result.append(SourcedString(decoded_substring, substring.source)) else: result.append(decoded_substring) index += len(substring) return SourcedString.concat(result)
#////////////////////////////////////////////////////////////////////// # Concrete Sourced String Classes #//////////////////////////////////////////////////////////////////////
# If any substrings have type 'str', then decode them to unicode.
#////////////////////////////////////////////////////////////////////// # Sourced String Regexp #//////////////////////////////////////////////////////////////////////
""" Wrapper for regexp pattern objects that causes the ``sub`` and ``subn`` methods to return sourced strings. """ return getattr(self.pattern, attr)
isinstance(string, SourcedString)): else:
def patch_re_module(): """ Modify the standard ``re`` module by installing new versions of the functions ``re.compile``, ``re.sub``, and ``re.subn``, causing regular expression substitutions to return ``SourcedStrings`` when called with ``SourcedStrings`` arguments.
Use this function only if necessary: it potentially affects all Python modules that use regular expressions! """ return re.compile(pattern).subn(repl, string, count)
def unpatch_re_module(): """ Restore the standard ``re`` module to its original state (undoing the work that was done by ``patch_re_module()``). """
#////////////////////////////////////////////////////////////////////// # Sourced String Stream #//////////////////////////////////////////////////////////////////////
""" Wrapper for a read-only stream that causes ``read()`` (and related methods) to return ``SourcedString`` objects. ``seek()`` and ``tell()`` are supported, but (currently) there are some restrictions on the values that may be passed to ``seek()``. """ """The underlying stream."""
"""The docid attribute for sourced strings"""
"""The current character (not byte) position"""
#///////////////////////////////////////////////////////////////// # Read methods #/////////////////////////////////////////////////////////////////
if size is None: return self._sourced_string(self.stream.read()) else: return self._sourced_string(self.stream.read(size))
else: return self._sourced_string(self.stream.readline(size))
""" Read this file's contents, decode them using this reader's encoding, and return it as a list of unicode lines.
:rtype: list(unicode) :param sizehint: Ignored. :param keepends: If false, then strip newlines. """ return self.read().splitlines(keepends)
"""Return the next decoded line from the underlying stream."""
return self.next()
"""Return self"""
"""Return self""" return self
"""Turn the given string into a sourced string, and update charpos.""" # [xx] currently we only support character offsets, not byte # offsets! self.charpos+len(contents))
#///////////////////////////////////////////////////////////////// # Pass-through methods & properties #/////////////////////////////////////////////////////////////////
def closed(self): """True if the underlying stream is closed.""" return self.stream.closed
def name(self): """The name of the underlying stream.""" return self.stream.name
def mode(self): """The mode of the underlying stream.""" return self.stream.mode
"""Close the underlying stream.""" self.stream.close()
#///////////////////////////////////////////////////////////////// # Seek and tell #/////////////////////////////////////////////////////////////////
else: raise TypeError('seek() must be called with a value that ' 'was returned by tell().') elif whence == 1: raise TypeError('Relative seek not supported for ' 'SourcedStringStream.') elif whence == 2: raise TypeError('Seek-from-end not supported for ' 'SourcedStringStream.') else: raise ValueError('Bad whence value %r' % whence)
|