Coverage for nltk.corpus.reader.xmldocs : 74%
![](keybd_closed.png)
Hot-keys on this page
r m x p toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
# Natural Language Toolkit: XML Corpus Reader # # Copyright (C) 2001-2012 NLTK Project # Author: Steven Bird <sb@csse.unimelb.edu.au> # URL: <http://www.nltk.org/> # For license information, see LICENSE.TXT
Corpus reader for corpora whose documents are xml files.
(note -- not named 'xml' to avoid conflicting w/ standard xml package) """
# Use the c version of ElementTree, which is faster, if possible: except ImportError: from xml.etree import ElementTree
""" Corpus reader for corpora whose documents are xml files.
Note that the ``XMLCorpusReader`` constructor does not take an ``encoding`` argument, because the unicode encoding is specified by the XML files themselves. See the XML specs for more info. """
# Make sure we have exactly one file -- no concatenating XML. if fileid is None and len(self._fileids) == 1: fileid = self._fileids[0] if not isinstance(fileid, compat.string_types): raise TypeError('Expected a single file identifier string') # Read the XML in using ElementTree. elt = ElementTree.parse(self.abspath(fileid).open()).getroot() # If requested, wrap it. if self._wrap_etree: elt = ElementWrapper(elt) # Return the ElementTree element. return elt
""" Returns all of the words and punctuation symbols in the specified file that were in text nodes -- ie, tags are ignored. Like the xml() method, fileid can only specify one file.
:return: the given file's text nodes as a list of words and punctuation symbols :rtype: list(str) """
elt = self.xml(fileid) word_tokenizer=WordPunctTokenizer() iterator = elt.getiterator() out = []
for node in iterator: text = node.text if text is not None: toks = word_tokenizer.tokenize(text) out.extend(toks) return out
if fileids is None: fileids = self._fileids elif isinstance(fileids, compat.string_types): fileids = [fileids] return concat([self.open(f).read() for f in fileids])
""" A corpus view that selects out specified elements from an XML file, and provides a flat list-like interface for accessing them. (Note: ``XMLCorpusView`` is not used by ``XMLCorpusReader`` itself, but may be used by subclasses of ``XMLCorpusReader``.)
Every XML corpus view has a "tag specification", indicating what XML elements should be included in the view; and each (non-nested) element that matches this specification corresponds to one item in the view. Tag specifications are regular expressions over tag paths, where a tag path is a list of element tag names, separated by '/', indicating the ancestry of the element. Some examples:
- ``'foo'``: A top-level element whose tag is ``foo``. - ``'foo/bar'``: An element whose tag is ``bar`` and whose parent is a top-level element whose tag is ``foo``. - ``'.*/foo'``: An element whose tag is ``foo``, appearing anywhere in the xml tree. - ``'.*/(foo|bar)'``: An element whose tag is ``foo`` or ``bar``, appearing anywhere in the xml tree.
The view items are generated from the selected XML elements via the method ``handle_elt()``. By default, this method returns the element as-is (i.e., as an ElementTree object); but it can be overridden, either via subclassing or via the ``elt_handler`` constructor parameter. """
#: If true, then display debugging output to stdout when reading #: blocks.
#: The number of characters read at a time by this corpus reader.
""" Create a new corpus view based on a specified XML file.
Note that the ``XMLCorpusView`` constructor does not take an ``encoding`` argument, because the unicode encoding is specified by the XML files themselves.
:type tagspec: str :param tagspec: A tag specification, indicating what XML elements should be included in the view. Each non-nested element that matches this specification corresponds to one item in the view.
:param elt_handler: A function used to transform each element to a value for the view. If no handler is specified, then ``self.handle_elt()`` is called, which returns the element as an ElementTree object. The signature of elt_handler is::
elt_handler(elt, tagspec) -> value """
"""The tag specification for this corpus view."""
"""A dictionary mapping from file positions (as returned by ``stream.seek()`` to XML contexts. An XML context is a tuple of XML tag names, indicating which tags have not yet been closed."""
else: s = open(fileid, 'rb').readline() return 'utf-16-be' return 'utf-16-le' return 'utf-32-be' return 'utf-32-le' return 'utf-8' # No encoding found -- what should the default be?
""" Convert an element into an appropriate value for inclusion in the view. Unless overridden by a subclass or by the ``elt_handler`` constructor argument, this method simply returns ``elt``.
:return: The view value corresponding to ``elt``.
:type elt: ElementTree :param elt: The element that should be converted.
:type context: str :param context: A string composed of element tags separated by forward slashes, indicating the XML context of the given element. For example, the string ``'foo/bar/baz'`` indicates that the element is a ``baz`` element whose parent is a ``bar`` element and whose grandparent is a top-level ``foo`` element. """ return elt
#: A regular expression that matches XML fragments that do not #: contain any un-closed tags. [^<]* ( ((<!--.*?-->) | # comment (<![CDATA[.*?]]) | # raw character data (<!DOCTYPE\s+[^\[]*(\[[^\]]*])?\s*>) | # doctype decl (<[^>]*>)) # tag or PI [^<]*)* \Z""", re.DOTALL|re.VERBOSE)
#: A regular expression used to extract the tag name from a start tag, #: end tag, or empty-elt tag string.
#: A regular expression used to find all start-tags, end-tags, and #: empty-elt tags in an XML file. This regexp is more lenient than #: the XML spec -- e.g., it allows spaces in some places where the #: spec does not. # Include these so we can skip them: (?P<COMMENT> <!--.*?--> )| (?P<CDATA> <![CDATA[.*?]]> )| (?P<PI> <\?.*?\?> )| (?P<DOCTYPE> <!DOCTYPE\s+[^\[]*(\[[^\]]*])?\s*> )| # These are the ones we actually care about: (?P<EMPTY_ELT_TAG> <\s*[^>/\?!\s][^>]*/\s*> )| (?P<START_TAG> <\s*[^>/\?!\s][^>]*> )| (?P<END_TAG> <\s*/[^>/\?!\s][^>]*> )""", re.DOTALL|re.VERBOSE)
""" Read a string from the given stream that does not contain any un-closed tags. In particular, this function first reads a block from the stream of size ``self._BLOCK_SIZE``. It then checks if that block contains an un-closed tag. If it does, then this function either backtracks to the last '<', or reads another block. """
# Read a block and add it to the fragment.
# Do we have a well-formed xml fragment?
# Do we have a fragment that will never be well-formed? pos = stream.tell() - ( len(fragment)-re.search('[<>]', fragment).end()) raise ValueError('Unexpected ">" near char %s' % pos)
# End of file? raise ValueError('Unexpected end of file: tag not closed')
# If not, then we must be in the middle of a <..tag..>. # If appropriate, backtrack to the most recent '<' # character. else: stream.seek(-(len(fragment)-last_open_bracket), 1)
# Otherwise, read another block. (i.e., return to the # top of the loop.)
""" Read from ``stream`` until we find at least one element that matches ``tagspec``, and return the result of applying ``elt_handler`` to each element found. """
# Use a stack of strings to keep track of our context:
# End of file. else: raise ValueError('Unexpected end of file')
# Process each <tag> in the xml fragment. print('%25s %s' % ('/'.join(context)[-20:], piece.group()))
# Keep context up-to-date. # Is this one of the elts we're looking for?
# sanity checks: raise ValueError('Unmatched tag </%s>' % name) raise ValueError('Unmatched tag <%s>...</%s>' % (context[-1], name)) # Is this the end of an element? # Keep context up-to-date
if re.match(tagspec, '/'.join(context)+'/'+name): elts.append((piece.group(), '/'.join(context)+'/'+name))
# If we haven't found any elements yet, then keep # looping until we do.
# If we've found at least one element, then try # backtracking to the start of the element that we're # inside of. else: # take back the last start-tag, and return what # we've gotten so far (elts is non-empty). print(' '*36+'(backtrack)') else: stream.seek(-(len(xml_fragment)-elt_start), 1)
# Update the _tag_context dict. else:
elt.encode('ascii', 'xmlcharrefreplace')), context) for (elt, context) in elts]
|