Hot-keys on this page

r m x p   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

1

2

3

4

5

6

7

8

9

10

11

12

13

14

15

16

17

18

19

20

21

22

23

24

25

26

27

28

29

30

31

32

33

34

35

36

37

38

39

40

41

42

43

44

45

46

47

48

49

50

51

52

53

54

55

56

57

58

59

60

61

62

63

64

65

66

67

68

69

70

71

72

73

74

75

76

77

78

79

80

81

82

83

84

85

86

87

88

89

90

91

92

93

94

95

96

97

98

99

100

101

102

103

104

105

106

107

108

109

110

111

112

113

114

115

116

117

118

119

120

121

122

123

124

125

126

127

128

129

130

131

132

133

134

135

136

137

138

139

140

141

142

143

144

145

146

147

148

149

150

151

152

153

154

155

156

157

158

159

160

161

162

163

164

165

166

167

168

169

170

171

172

173

174

175

176

177

178

179

180

181

182

183

184

185

186

187

188

189

190

191

192

193

194

195

196

197

198

199

200

201

202

203

204

205

206

207

208

209

210

# Natural Language Toolkit: Chunked Corpus Reader 

# 

# Copyright (C) 2001-2012 NLTK Project 

# Author: Steven Bird <sb@ldc.upenn.edu> 

#         Edward Loper <edloper@gradient.cis.upenn.edu> 

# URL: <http://www.nltk.org/> 

# For license information, see LICENSE.TXT 

 

""" 

A reader for corpora that contain chunked (and optionally tagged) 

documents. 

""" 

 

import os.path, codecs 

 

import nltk 

from nltk.corpus.reader.bracket_parse import BracketParseCorpusReader 

from nltk import compat 

from nltk.tree import Tree 

from nltk.tokenize import * 

from nltk.chunk import tagstr2tree 

from .util import * 

from .api import * 

 

class ChunkedCorpusReader(CorpusReader):
    """
    Reader for chunked (and optionally tagged) corpora.  Paragraphs
    are split using a block reader.  They are then tokenized into
    sentences using a sentence tokenizer.  Finally, these sentences
    are parsed into chunk trees using a string-to-chunktree conversion
    function.  Each of these steps can be performed using a default
    function or a custom function.  By default, paragraphs are split
    on blank lines; sentences are listed one per line; and sentences
    are parsed into chunk trees using ``nltk.chunk.tagstr2tree``.
    """
    def __init__(self, root, fileids, extension='',
                 str2chunktree=tagstr2tree,
                 sent_tokenizer=RegexpTokenizer('\n', gaps=True),
                 para_block_reader=read_blankline_block,
                 encoding=None):
        """
        :param root: The root directory for this corpus.
        :param fileids: A list or regexp specifying the fileids in this corpus.
        :param extension: Unused; retained for backward compatibility with
            existing callers.
        :param str2chunktree: Function converting one sentence string into
            a chunk ``Tree``.
        :param sent_tokenizer: Tokenizer used to split each paragraph block
            into sentence strings (default: one sentence per line).
        :param para_block_reader: Function reading one paragraph block per
            call from a stream (default: split on blank lines).
        :param encoding: File encoding, passed through to ``CorpusReader``.
        """
        CorpusReader.__init__(self, root, fileids, encoding)

        # Arguments for corpus views generated by this corpus: a tuple
        # (str2chunktree, sent_tokenizer, para_block_reader).
        self._cv_args = (str2chunktree, sent_tokenizer, para_block_reader)

    def _views(self, fileids, tagged, group_by_sent, group_by_para, chunked):
        """
        Build one ``ChunkedCorpusView`` per requested fileid, configured
        with the given output-format flags, and concatenate them.  This is
        the single helper behind all of the ``words()``/``sents()``/...
        accessor methods below.
        """
        return concat([ChunkedCorpusView(f, enc, tagged, group_by_sent,
                                         group_by_para, chunked,
                                         *self._cv_args)
                       for (f, enc) in self.abspaths(fileids, True)])

    def raw(self, fileids=None):
        """
        :return: the given file(s) as a single string.
        :rtype: str
        """
        if fileids is None:
            fileids = self._fileids
        elif isinstance(fileids, compat.string_types):
            fileids = [fileids]
        return concat([self.open(f).read() for f in fileids])

    def words(self, fileids=None):
        """
        :return: the given file(s) as a list of words
            and punctuation symbols.
        :rtype: list(str)
        """
        return self._views(fileids, 0, 0, 0, 0)

    def sents(self, fileids=None):
        """
        :return: the given file(s) as a list of
            sentences or utterances, each encoded as a list of word
            strings.
        :rtype: list(list(str))
        """
        return self._views(fileids, 0, 1, 0, 0)

    def paras(self, fileids=None):
        """
        :return: the given file(s) as a list of
            paragraphs, each encoded as a list of sentences, which are
            in turn encoded as lists of word strings.
        :rtype: list(list(list(str)))
        """
        return self._views(fileids, 0, 1, 1, 0)

    def tagged_words(self, fileids=None):
        """
        :return: the given file(s) as a list of tagged
            words and punctuation symbols, encoded as tuples
            ``(word,tag)``.
        :rtype: list(tuple(str,str))
        """
        return self._views(fileids, 1, 0, 0, 0)

    def tagged_sents(self, fileids=None):
        """
        :return: the given file(s) as a list of
            sentences, each encoded as a list of ``(word,tag)`` tuples.
        :rtype: list(list(tuple(str,str)))
        """
        return self._views(fileids, 1, 1, 0, 0)

    def tagged_paras(self, fileids=None):
        """
        :return: the given file(s) as a list of
            paragraphs, each encoded as a list of sentences, which are
            in turn encoded as lists of ``(word,tag)`` tuples.
        :rtype: list(list(list(tuple(str,str))))
        """
        return self._views(fileids, 1, 1, 1, 0)

    def chunked_words(self, fileids=None):
        """
        :return: the given file(s) as a list of tagged
            words and chunks.  Words are encoded as ``(word, tag)``
            tuples (if the corpus has tags) or word strings (if the
            corpus has no tags).  Chunks are encoded as depth-one
            trees over ``(word,tag)`` tuples or word strings.
        :rtype: list(tuple(str,str) and Tree)
        """
        return self._views(fileids, 1, 0, 0, 1)

    def chunked_sents(self, fileids=None):
        """
        :return: the given file(s) as a list of
            sentences, each encoded as a shallow Tree.  The leaves
            of these trees are encoded as ``(word, tag)`` tuples (if
            the corpus has tags) or word strings (if the corpus has no
            tags).
        :rtype: list(Tree)
        """
        return self._views(fileids, 1, 1, 0, 1)

    def chunked_paras(self, fileids=None):
        """
        :return: the given file(s) as a list of
            paragraphs, each encoded as a list of sentences, which are
            in turn encoded as a shallow Tree.  The leaves of these
            trees are encoded as ``(word, tag)`` tuples (if the corpus
            has tags) or word strings (if the corpus has no tags).
        :rtype: list(list(Tree))
        """
        return self._views(fileids, 1, 1, 1, 1)

    def _read_block(self, stream):
        # Parse one blank-line-delimited block into a list of chunk trees.
        return [tagstr2tree(t) for t in read_blankline_block(stream)]

 

class ChunkedCorpusView(StreamBackedCorpusView):
    """
    A stream-backed corpus view over a chunked (and optionally tagged)
    corpus file.  The boolean flags given to the constructor control
    whether tags and chunk structure are kept, and whether tokens are
    grouped by sentence and/or paragraph.
    """
    def __init__(self, fileid, encoding, tagged, group_by_sent,
                 group_by_para, chunked, str2chunktree, sent_tokenizer,
                 para_block_reader):
        StreamBackedCorpusView.__init__(self, fileid, encoding=encoding)
        # Output-format flags.
        self._tagged = tagged
        self._group_by_sent = group_by_sent
        self._group_by_para = group_by_para
        self._chunked = chunked
        # Pluggable parsing components.
        self._str2chunktree = str2chunktree
        self._sent_tokenizer = sent_tokenizer
        self._para_block_reader = para_block_reader

    def read_block(self, stream):
        """
        Read one paragraph block from *stream* and convert it to the
        configured output format.
        """
        result = []
        for para_str in self._para_block_reader(stream):
            sents = [self._convert_sent(sent_str)
                     for sent_str in self._sent_tokenizer.tokenize(para_str)]
            if self._group_by_sent:
                para = sents
            else:
                # Flatten: one token list per paragraph.
                para = [token for sent in sents for token in sent]
            if self._group_by_para:
                result.append(para)
            else:
                result.extend(para)
        return result

    def _convert_sent(self, sent_str):
        """
        Parse one sentence string into a chunk tree, then strip tags
        and/or chunk structure as requested by the view's flags.
        """
        sent = self._str2chunktree(sent_str)
        # If requested, throw away the tags.
        if not self._tagged:
            sent = self._untag(sent)
        # If requested, throw away the chunks.
        if not self._chunked:
            sent = sent.leaves()
        return sent

    def _untag(self, tree):
        """
        Replace every ``(word, tag)`` leaf of *tree* with just the word,
        modifying *tree* in place; return *tree*.
        """
        for index in range(len(tree)):
            child = tree[index]
            if isinstance(child, Tree):
                self._untag(child)
            elif isinstance(child, tuple):
                tree[index] = child[0]
            else:
                raise ValueError('expected child to be Tree or tuple')
        return tree