Coverage for nltk.corpus.reader.bracket_parse : 58%
![Keyboard shortcuts](keybd_closed.png)
Hot-keys on this page
r m x p toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
# Natural Language Toolkit: Penn Treebank Reader
#
# Copyright (C) 2001-2012 NLTK Project
# Author: Steven Bird <sb@ldc.upenn.edu>
#         Edward Loper <edloper@gradient.cis.upenn.edu>
# URL: <http://www.nltk.org/>
# For license information, see LICENSE.TXT
"""
Corpus reader for corpora that consist of parenthesis-delineated
parse trees.
"""
# we use [^\s()]+ instead of \S+? to avoid matching ()
""" Reader for corpora that consist of parenthesis-delineated parse trees. """ detect_blocks='unindented_paren', encoding=None, tag_mapping_function=None): """ :param root: The root directory for this corpus. :param fileids: A list or regexp specifying the fileids in this corpus. :param comment_char: The character which can appear at the start of a line to indicate that the rest of the line is a comment. :param detect_blocks: The method that is used to find blocks in the corpus; can be 'unindented_paren' (every unindented parenthesis starts a new parse) or 'sexpr' (brackets are matched). """
return read_sexpr_block(stream, comment_char=self._comment_char) return read_blankline_block(stream) # Tokens start with unindented left parens. # Strip any comments out of the tokens. toks = [re.sub('(?m)^%s.*'%re.escape(self._comment_char), '', tok) for tok in toks] else: assert 0, 'bad block type'
# If there's an empty set of brackets surrounding the actual # parse, then strip them off. # Replace leaves of the form (!), (,), with (! !), (, ,) # Replace leaves of the form (tag word root) with (tag word)
except ValueError as e: sys.stderr.write("Bad tree detected; trying to recover...\n") # Try to recover, if we can: if e.args == ('mismatched parens',): for n in range(1, 5): try: v = Tree.parse(self._normalize(t+')'*n)) sys.stderr.write(" Recovered by adding %d close " "paren(s)\n" % n) return v except ValueError: pass # Try something else: sys.stderr.write(" Recovered by returning a flat parse.\n") #sys.stderr.write(' '.join(t.split())+'\n') return Tree('S', self._tag(t))
tagged_sent = [(w,t) for (t,w) in TAGWORD.findall(self._normalize(t))] if simplify_tags: tagged_sent = [(w, self._tag_mapping_function(t)) for (w,t) in tagged_sent] return tagged_sent
""" Reader for the Alpino Dutch Treebank. """ detect_blocks='blankline', encoding=encoding, tag_mapping_function=tag_mapping_function)
if t[:10] != "<alpino_ds": return "" # convert XML to sexpr notation t = re.sub(r' <node .*? cat="(\w+)".*>', r"(\1", t) t = re.sub(r' <node .*? pos="(\w+)".*? word="([^"]+)".*/>', r"(\1 \2)", t) t = re.sub(r" </node>", r")", t) t = re.sub(r"<sentence>.*</sentence>", r"", t) t = re.sub(r"</?alpino_ds.*>", r"", t) return t |