# Natural Language Toolkit: Plaintext Corpus Reader
#
# Copyright (C) 2001-2012 NLTK Project
# Author: Steven Bird <sb@ldc.upenn.edu>
#         Edward Loper <edloper@gradient.cis.upenn.edu>
#         Nitin Madnani <nmadnani@umiacs.umd.edu>
# URL: <http://www.nltk.org/>
# For license information, see LICENSE.TXT

"""
A reader for corpora that consist of plaintext documents.
"""

from nltk import compat
import nltk.data
from nltk.tokenize import *

from .util import *
from .api import *


class PlaintextCorpusReader(CorpusReader):
    """
    Reader for corpora that consist of plaintext documents.  Paragraphs
    are assumed to be split using blank lines.  Sentences and words can
    be tokenized using the default tokenizers, or by custom tokenizers
    specified as parameters to the constructor.

    This corpus reader can be customized (e.g., to skip preface
    sections of specific document formats) by creating a subclass and
    overriding the ``CorpusView`` class variable.
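
    For example, a subclass might assign a custom corpus view (a sketch;
    ``PrefaceSkippingView`` is a hypothetical subclass of
    ``StreamBackedCorpusView`` that skips a document preface)::

        class MyCorpusReader(PlaintextCorpusReader):
            CorpusView = PrefaceSkippingView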

    """ 

 

    CorpusView = StreamBackedCorpusView
    """The corpus view class used by this reader.  Subclasses of
       ``PlaintextCorpusReader`` may specify alternative corpus view
       classes (e.g., to skip the preface sections of documents)."""


    def __init__(self, root, fileids,
                 word_tokenizer=WordPunctTokenizer(),
                 sent_tokenizer=nltk.data.LazyLoader(
                     'tokenizers/punkt/english.pickle'),
                 para_block_reader=read_blankline_block,
                 encoding=None):
        """
        Construct a new plaintext corpus reader for a set of documents
        located at the given root directory.  Example usage:

            >>> root = '/usr/local/share/nltk_data/corpora/webtext/'
            >>> reader = PlaintextCorpusReader(root, r'.*\.txt')
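
            The documents can then be accessed through the reader's
            methods, for example (the fileid shown is illustrative, and
            the calls are skipped under doctest):

            >>> reader.fileids() # doctest: +SKIP
            >>> reader.words('firefox.txt') # doctest: +SKIP
            >>> reader.sents('firefox.txt') # doctest: +SKIP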

 

        :param root: The root directory for this corpus.
        :param fileids: A list or regexp specifying the fileids in this corpus.
        :param word_tokenizer: Tokenizer for breaking sentences or
            paragraphs into words.
        :param sent_tokenizer: Tokenizer for breaking paragraphs
            into sentences.
        :param para_block_reader: The block reader used to divide the
            corpus into paragraph blocks.
        :param encoding: The encoding of the files in this corpus
            (passed on to ``CorpusReader``).
        """
        CorpusReader.__init__(self, root, fileids, encoding)
        self._word_tokenizer = word_tokenizer
        self._sent_tokenizer = sent_tokenizer
        self._para_block_reader = para_block_reader

    def raw(self, fileids=None, sourced=False):
        """
        :return: the given file(s) as a single string.
        :rtype: str
        """
        if fileids is None: fileids = self._fileids
        elif isinstance(fileids, compat.string_types): fileids = [fileids]
        return concat([self.open(f, sourced).read() for f in fileids])

    def words(self, fileids=None, sourced=False):
        """
        :return: the given file(s) as a list of words
            and punctuation symbols.
        :rtype: list(str)
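
        For example (a sketch, with ``reader`` constructed as above)::

            reader.words()[:10]    # first ten word/punctuation tokens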

        """ 

        return concat([self.CorpusView(path, self._read_word_block,
                                       encoding=enc,
                                       source=(fileid if sourced else None))
                       for (path, enc, fileid)
                       in self.abspaths(fileids, True, True)])

    def sents(self, fileids=None, sourced=False):
        """
        :return: the given file(s) as a list of
            sentences or utterances, each encoded as a list of word
            strings.
        :rtype: list(list(str))
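
        For example (a sketch)::

            reader.sents()[0]    # first sentence, as a list of tokens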

        """ 

        if self._sent_tokenizer is None:
            raise ValueError('No sentence tokenizer for this corpus')
        return concat([self.CorpusView(path, self._read_sent_block,
                                       encoding=enc,
                                       source=(fileid if sourced else None))
                       for (path, enc, fileid)
                       in self.abspaths(fileids, True, True)])

    def paras(self, fileids=None, sourced=False):
        """
        :return: the given file(s) as a list of
            paragraphs, each encoded as a list of sentences, which are
            in turn encoded as lists of word strings.
        :rtype: list(list(list(str)))
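
        For example (a sketch)::

            reader.paras()[0]       # first paragraph: a list of sentences
            reader.paras()[0][0]    # its first sentence: a list of tokens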

        """ 

        if self._sent_tokenizer is None:
            raise ValueError('No sentence tokenizer for this corpus')
        return concat([self.CorpusView(path, self._read_para_block,
                                       encoding=enc,
                                       source=(fileid if sourced else None))
                       for (path, enc, fileid)
                       in self.abspaths(fileids, True, True)])

    def _read_word_block(self, stream):
        words = []
        for i in range(20): # Read 20 lines at a time.
            words.extend(self._word_tokenizer.tokenize(stream.readline()))
        return words

    def _read_sent_block(self, stream):
        sents = []
        for para in self._para_block_reader(stream):
            sents.extend([self._word_tokenizer.tokenize(sent)
                          for sent in self._sent_tokenizer.tokenize(para)])
        return sents

    def _read_para_block(self, stream):
        paras = []
        for para in self._para_block_reader(stream):
            paras.append([self._word_tokenizer.tokenize(sent)
                          for sent in self._sent_tokenizer.tokenize(para)])
        return paras


class CategorizedPlaintextCorpusReader(CategorizedCorpusReader,
                                       PlaintextCorpusReader):
    """
    A reader for plaintext corpora whose documents are divided into
    categories based on their file identifiers.
    """
    def __init__(self, *args, **kwargs):
        """
        Initialize the corpus reader.  Categorization arguments
        (``cat_pattern``, ``cat_map``, and ``cat_file``) are passed to
        the ``CategorizedCorpusReader`` constructor.  The remaining
        arguments are passed to the ``PlaintextCorpusReader`` constructor.
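
        For example (a sketch; assumes the corpus keeps one subdirectory
        per category, with the category name captured by the first group
        of ``cat_pattern``):

            >>> reader = CategorizedPlaintextCorpusReader(
            ...     root, r'.*\.txt', cat_pattern=r'(\w+)/.*') # doctest: +SKIP
            >>> reader.words(categories='news') # doctest: +SKIP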

        """ 

        CategorizedCorpusReader.__init__(self, kwargs)
        PlaintextCorpusReader.__init__(self, *args, **kwargs)

    def _resolve(self, fileids, categories):
        if fileids is not None and categories is not None:
            raise ValueError('Specify fileids or categories, not both')
        if categories is not None:
            return self.fileids(categories)
        else:
            return fileids

    def raw(self, fileids=None, categories=None):
        return PlaintextCorpusReader.raw(
            self, self._resolve(fileids, categories))

    def words(self, fileids=None, categories=None):
        return PlaintextCorpusReader.words(
            self, self._resolve(fileids, categories))

    def sents(self, fileids=None, categories=None):
        return PlaintextCorpusReader.sents(
            self, self._resolve(fileids, categories))

    def paras(self, fileids=None, categories=None):
        return PlaintextCorpusReader.paras(
            self, self._resolve(fileids, categories))


# is there a better way?
class PortugueseCategorizedPlaintextCorpusReader(CategorizedPlaintextCorpusReader):
    """
    A ``CategorizedPlaintextCorpusReader`` that always uses the
    Portuguese Punkt model as its sentence tokenizer.
    """
    def __init__(self, *args, **kwargs):
        CategorizedCorpusReader.__init__(self, kwargs)
        kwargs['sent_tokenizer'] = nltk.data.LazyLoader(
            'tokenizers/punkt/portuguese.pickle')
        PlaintextCorpusReader.__init__(self, *args, **kwargs)


class EuroparlCorpusReader(PlaintextCorpusReader):
    """
    Reader for Europarl corpora that consist of plaintext documents.
    Documents are divided into chapters instead of paragraphs as
    for regular plaintext documents.  Chapters are separated using blank
    lines.  Everything is inherited from ``PlaintextCorpusReader`` except
    that:

      - Since the corpus is pre-processed and pre-tokenized, the
        word tokenizer just splits each line at whitespace.
      - For the same reason, the sentence tokenizer just splits
        each paragraph at line breaks.
      - There is a new ``chapters()`` method that returns chapters
        instead of paragraphs.
      - The ``paras()`` method inherited from ``PlaintextCorpusReader``
        is made non-functional to remove any confusion between chapters
        and paragraphs for Europarl.
    """


    def _read_word_block(self, stream):
        words = []
        for i in range(20): # Read 20 lines at a time.
            words.extend(stream.readline().split())
        return words

    def _read_sent_block(self, stream):
        sents = []
        for para in self._para_block_reader(stream):
            sents.extend([sent.split() for sent in para.splitlines()])
        return sents

    def _read_para_block(self, stream):
        paras = []
        for para in self._para_block_reader(stream):
            paras.append([sent.split() for sent in para.splitlines()])
        return paras

    def chapters(self, fileids=None):
        """
        :return: the given file(s) as a list of
            chapters, each encoded as a list of sentences, which are
            in turn encoded as lists of word strings.
        :rtype: list(list(list(str)))
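
        For example (a sketch)::

            reader.chapters()[0]    # first chapter: a list of sentences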

        """ 

        return concat([self.CorpusView(fileid, self._read_para_block,
                                       encoding=enc)
                       for (fileid, enc) in self.abspaths(fileids, True)])


    def paras(self, fileids=None):
        raise NotImplementedError(
            'The Europarl corpus reader does not support paragraphs. '
            'Please use chapters() instead.')
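

# A minimal, self-contained demonstration (a sketch, not part of the
# original module): it assumes the ``webtext`` corpus from the NLTK data
# package has been installed, e.g. via nltk.download('webtext').
def demo():
    root = nltk.data.find('corpora/webtext')
    reader = PlaintextCorpusReader(root, r'.*\.txt')
    print(reader.fileids())       # the documents in the corpus
    print(reader.words()[:20])    # first twenty word/punctuation tokens
    print(reader.sents()[0])      # first sentence, as a list of tokens

if __name__ == '__main__':
    demo()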