Hot-keys on this page

r m x p   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

1

2

3

4

5

6

7

8

9

10

11

12

13

14

15

16

17

18

19

20

21

22

23

24

25

26

27

28

29

30

31

32

33

34

35

36

37

38

39

40

41

42

43

44

45

46

47

48

49

50

51

52

53

54

55

56

57

58

59

60

61

62

63

64

65

66

67

68

69

70

71

72

73

74

75

76

77

78

79

80

81

82

83

84

85

86

87

88

89

90

91

92

93

94

95

96

97

98

99

100

101

102

103

104

105

106

107

108

109

110

111

112

113

114

115

116

117

118

119

120

121

122

123

124

125

126

127

128

129

130

131

132

133

134

135

136

137

138

139

140

141

142

143

144

145

146

147

148

149

150

151

152

153

154

155

156

157

158

159

160

161

162

163

164

165

166

167

168

169

170

171

172

173

174

175

176

177

178

179

180

181

182

183

184

185

186

187

188

189

190

191

192

193

194

195

196

197

198

199

200

201

202

203

204

205

206

207

208

209

210

211

212

213

214

215

216

217

218

219

220

221

222

223

224

225

226

227

228

229

230

231

232

233

234

235

236

237

238

239

240

241

242

243

244

245

246

247

248

249

250

251

252

253

254

255

256

257

258

259

260

261

262

263

264

265

266

267

268

269

270

271

272

273

274

275

276

277

278

279

280

281

282

283

284

285

286

287

288

289

290

291

292

293

294

295

296

297

298

299

300

301

302

303

304

305

306

307

308

309

310

311

312

313

314

315

316

317

318

319

320

321

322

323

324

325

326

327

328

329

330

331

332

333

334

335

336

337

338

339

340

341

342

343

344

345

346

347

348

349

350

351

352

353

354

355

356

357

358

359

360

361

362

363

364

365

366

367

368

369

370

371

372

373

374

375

376

377

378

379

380

381

382

383

384

385

386

387

388

389

390

391

392

393

394

395

396

397

398

399

400

401

402

403

404

405

406

407

408

409

410

411

412

413

414

415

416

417

418

419

420

421

422

423

424

425

426

# Natural Language Toolkit: API for Corpus Readers 

# 

# Copyright (C) 2001-2012 NLTK Project 

# Author: Steven Bird <sb@ldc.upenn.edu> 

#         Edward Loper <edloper@gradient.cis.upenn.edu> 

# URL: <http://www.nltk.org/> 

# For license information, see LICENSE.TXT 

 

""" 

API for corpus readers. 

""" 

 

import os 

import re 

from collections import defaultdict 

 

from nltk import compat 

from nltk.data import PathPointer, FileSystemPathPointer, ZipFilePathPointer 

from nltk.sourcedstring import SourcedStringStream 

 

from .util import * 

 

class CorpusReader(object):
    """
    A base class for "corpus reader" classes, each of which can be
    used to read a specific corpus format.  Each individual corpus
    reader instance is used to read a specific corpus, consisting of
    one or more files under a common root directory.  Each file is
    identified by its ``file identifier``, which is the relative path
    to the file from the root directory.

    A separate subclass is defined for each corpus format.  These
    subclasses define one or more methods that provide 'views' on the
    corpus contents, such as ``words()`` (for a list of words) and
    ``parsed_sents()`` (for a list of parsed sentences).  Called with
    no arguments, these methods will return the contents of the entire
    corpus.  For most corpora, these methods define one or more
    selection arguments, such as ``fileids`` or ``categories``, which can
    be used to select which portion of the corpus should be returned.
    """
    def __init__(self, root, fileids, encoding=None, tag_mapping_function=None):
        """
        :type root: PathPointer or str
        :param root: A path pointer identifying the root directory for
            this corpus.  If a string is specified, then it will be
            converted to a ``PathPointer`` automatically.
        :param fileids: A list of the files that make up this corpus.
            This list can either be specified explicitly, as a list of
            strings; or implicitly, as a regular expression over file
            paths.  The absolute path for each file will be constructed
            by joining the reader's root to each file name.
        :param encoding: The default unicode encoding for the files
            that make up the corpus.  The value of ``encoding`` can be any
            of the following:
            - A string: ``encoding`` is the encoding name for all files.
            - A dictionary: ``encoding[file_id]`` is the encoding
              name for the file whose identifier is ``file_id``.  If
              ``file_id`` is not in ``encoding``, then the file
              contents will be processed using non-unicode byte strings.
            - A list: ``encoding`` should be a list of ``(regexp, encoding)``
              tuples.  The encoding for a file whose identifier is ``file_id``
              will be the ``encoding`` value for the first tuple whose
              ``regexp`` matches the ``file_id``.  If no tuple's ``regexp``
              matches the ``file_id``, the file contents will be processed
              using non-unicode byte strings.
            - None: the file contents of all files will be
              processed using non-unicode byte strings.
        :param tag_mapping_function: A function for normalizing or
                simplifying the POS tags returned by the tagged_words()
                or tagged_sents() methods.
        :raise TypeError: If ``root`` is neither a string nor a
            ``PathPointer``.
        """
        # Convert the root to a path pointer, if necessary.
        if isinstance(root, compat.string_types):
            # The trailing empty alternative lets the pattern match any
            # string, so ``m`` is never None; both groups are None when
            # the root has no ``.zip`` component.
            m = re.match(r'(.*\.zip)/?(.*)$|', root)
            zipfile, zipentry = m.groups()
            if zipfile:
                root = ZipFilePathPointer(zipfile, zipentry)
            else:
                root = FileSystemPathPointer(root)
        elif not isinstance(root, PathPointer):
            raise TypeError('CorpusReader: expected a string or a PathPointer')

        # If `fileids` is a regexp, then expand it.
        if isinstance(fileids, compat.string_types):
            fileids = find_corpus_fileids(root, fileids)

        #: A list of the relative paths for the fileids that make up
        #: this corpus.
        self._fileids = fileids

        #: The root directory for this corpus.
        self._root = root

        # If encoding was specified as a list of regexps, then convert
        # it to a dictionary.  Only the first matching regexp is used
        # for each file; files matching no regexp get no entry.
        if isinstance(encoding, list):
            encoding_dict = {}
            for fileid in self._fileids:
                for regexp, enc in encoding:
                    if re.match(regexp, fileid):
                        encoding_dict[fileid] = enc
                        break
            encoding = encoding_dict

        #: The default unicode encoding for the fileids that make up
        #: this corpus.  If ``encoding`` is None, then the file
        #: contents are processed using byte strings (str).
        self._encoding = encoding

        #: The tag mapping function given to the constructor (stored
        #: here; not used by this base class itself).
        self._tag_mapping_function = tag_mapping_function

    def __repr__(self):
        if isinstance(self._root, ZipFilePathPointer):
            path = '%s/%s' % (self._root.zipfile.filename, self._root.entry)
        else:
            path = '%s' % self._root.path
        return '<%s in %r>' % (self.__class__.__name__, path)

    def readme(self):
        """
        Return the contents of the file named ``README`` under the
        corpus root.
        """
        stream = self.open("README")
        # Close the stream even if reading fails, so that the
        # underlying file handle is not leaked.
        try:
            return stream.read()
        finally:
            stream.close()

    def fileids(self):
        """
        Return a list of file identifiers for the fileids that make up
        this corpus.
        """
        return self._fileids

    def abspath(self, fileid):
        """
        Return the absolute path for the given file.

        :type fileid: str
        :param fileid: The file identifier for the file whose path
            should be returned.
        :rtype: PathPointer
        """
        return self._root.join(fileid)

    def abspaths(self, fileids=None, include_encoding=False,
                 include_fileid=False):
        """
        Return a list of the absolute paths for all fileids in this corpus;
        or for the given list of fileids, if specified.

        :type fileids: None or str or list
        :param fileids: Specifies the set of fileids for which paths should
            be returned.  Can be None, for all fileids; a list of
            file identifiers, for a specified set of fileids; or a single
            file identifier, for a single file.  Note that the return
            value is always a list of paths, even if ``fileids`` is a
            single file identifier.

        :param include_encoding: If true, then return a list of
            ``(path_pointer, encoding)`` tuples.

        :param include_fileid: If true, then return a list of
            ``(path_pointer, fileid)`` tuples; combined with
            ``include_encoding``, a list of
            ``(path_pointer, encoding, fileid)`` tuples.

        :rtype: list(PathPointer) or list(tuple)
        """
        if fileids is None:
            fileids = self._fileids
        elif isinstance(fileids, compat.string_types):
            fileids = [fileids]

        paths = [self._root.join(f) for f in fileids]
        if not (include_encoding or include_fileid):
            return paths

        # Column order is always: path, then encoding, then fileid.
        columns = [paths]
        if include_encoding:
            columns.append([self.encoding(f) for f in fileids])
        if include_fileid:
            columns.append(fileids)
        # Materialize the result: on Python 3 a bare zip() is a one-shot
        # iterator, but this method is documented to return a list.
        return list(zip(*columns))

    def open(self, file, sourced=False):
        """
        Return an open stream that can be used to read the given file.
        If the file's encoding is not None, then the stream will
        automatically decode the file's contents into unicode.

        :param file: The file identifier of the file to read.
        :param sourced: If true, wrap the stream in a
            ``SourcedStringStream`` built from the stream and ``file``.
        """
        encoding = self.encoding(file)
        stream = self._root.join(file).open(encoding)
        if sourced:
            stream = SourcedStringStream(stream, file)
        return stream

    def encoding(self, file):
        """
        Return the unicode encoding for the given corpus file, if known.
        If the encoding is unknown, or if the given file should be
        processed using byte strings (str), then return None.
        """
        if isinstance(self._encoding, dict):
            return self._encoding.get(file)
        else:
            return self._encoding

    def _get_root(self): return self._root
    root = property(_get_root, doc="""
        The directory where this corpus is stored.

        :type: PathPointer""")

 

 

###################################################################### 

#{ Corpora containing categorized items 

###################################################################### 

 

class CategorizedCorpusReader(object):
    """
    A mixin class used to aid in the implementation of corpus readers
    for categorized corpora.  This class defines the method
    ``categories()``, which returns a list of the categories for the
    corpus or for a specified set of fileids; and overrides ``fileids()``
    to take a ``categories`` argument, restricting the set of fileids to
    be returned.

    Subclasses are expected to:

      - Call ``__init__()`` to set up the mapping.

      - Override all view methods to accept a ``categories`` parameter,
        which can be used *instead* of the ``fileids`` parameter, to
        select which fileids should be included in the returned view.
    """

    def __init__(self, kwargs):
        """
        Initialize this mapping based on keyword arguments, as
        follows:

          - cat_pattern: A regular expression pattern used to find the
            category for each file identifier.  The pattern will be
            applied to each file identifier, and the first matching
            group will be used as the category label for that file.

          - cat_map: A dictionary, mapping from file identifiers to
            category labels.

          - cat_file: The name of a file that contains the mapping
            from file identifiers to categories.  The argument
            ``cat_delimiter`` can be used to specify a delimiter.

        The corresponding argument will be deleted from ``kwargs``.  If
        more than one argument is specified, an exception will be
        raised.

        :param kwargs: The dictionary of keyword arguments; it is
            modified in place.
        :raise ValueError: If none, or more than one, of ``cat_pattern``,
            ``cat_map`` and ``cat_file`` is present in ``kwargs``.
        """
        # Both mappings are built lazily by _init() on first use.
        self._f2c = None #: file-to-category mapping
        self._c2f = None #: category-to-file mapping

        self._pattern = None #: regexp specifying the mapping
        self._map = None #: dict specifying the mapping
        self._file = None #: fileid of file containing the mapping
        self._delimiter = None #: delimiter for ``self._file``

        if 'cat_pattern' in kwargs:
            self._pattern = kwargs.pop('cat_pattern')
        elif 'cat_map' in kwargs:
            self._map = kwargs.pop('cat_map')
        elif 'cat_file' in kwargs:
            self._file = kwargs.pop('cat_file')
            # ``cat_delimiter`` is only consumed together with
            # ``cat_file``; it stays None when not given.
            self._delimiter = kwargs.pop('cat_delimiter', None)
        else:
            raise ValueError('Expected keyword argument cat_pattern or '
                             'cat_map or cat_file.')

        # The selected argument was removed above, so any mapping
        # argument still present means more than one was given.
        if ('cat_pattern' in kwargs or 'cat_map' in kwargs or
            'cat_file' in kwargs):
            raise ValueError('Specify exactly one of: cat_pattern, '
                             'cat_map, cat_file.')

    def _init(self):
        """
        Build the file-to-category and category-to-file mappings from
        whichever source (pattern, dict or mapping file) was given to
        the constructor.

        :raise ValueError: If the mapping file names a file identifier
            that is not in ``self.fileids()``.
        """
        self._f2c = defaultdict(set)
        self._c2f = defaultdict(set)

        if self._pattern is not None:
            for file_id in self._fileids:
                category = re.match(self._pattern, file_id).group(1)
                self._add(file_id, category)

        elif self._map is not None:
            for (file_id, categories) in self._map.items():
                for category in categories:
                    self._add(file_id, category)

        elif self._file is not None:
            # Read everything up front and close the stream, so the
            # file handle is released even if a line is malformed.
            stream = self.open(self._file)
            try:
                lines = stream.readlines()
            finally:
                stream.close()

            # Build the lookup set once instead of scanning the list
            # of fileids for every line of the mapping file.
            known_fileids = set(self.fileids())
            for line in lines:
                line = line.strip()
                if not line:
                    # A blank line carries no mapping; skip it rather
                    # than failing on the unpacking below.
                    continue
                file_id, categories = line.split(self._delimiter, 1)
                if file_id not in known_fileids:
                    raise ValueError('In category mapping file %s: %s '
                                     'not found' % (self._file, file_id))
                for category in categories.split(self._delimiter):
                    self._add(file_id, category)

    def _add(self, file_id, category):
        """Record ``category`` for ``file_id`` in both mappings."""
        self._f2c[file_id].add(category)
        self._c2f[category].add(file_id)

    def categories(self, fileids=None):
        """
        Return a list of the categories that are defined for this corpus,
        or for the file(s) if it is given.

        :param fileids: None, for all categories; a single file
            identifier; or a list of file identifiers.
        :return: A sorted list of category labels (empty if ``fileids``
            is an empty list or names only unknown files).
        """
        if self._f2c is None:
            self._init()
        if fileids is None:
            return sorted(self._c2f)
        if isinstance(fileids, compat.string_types):
            fileids = [fileids]
        # Use .get() so that unknown ids are not inserted into the
        # defaultdict, and accumulate into a fresh set so that an
        # empty ``fileids`` yields [] instead of a TypeError.
        found = set()
        for file_id in fileids:
            found.update(self._f2c.get(file_id, ()))
        return sorted(found)

    def fileids(self, categories=None):
        """
        Return a list of file identifiers for the files that make up
        this corpus, or that make up the given category(s) if specified.

        :param categories: None, for all files; a single category
            label; or a list of category labels.
        :raise ValueError: If ``categories`` is a single label that is
            not a known category.
        """
        if categories is None:
            return super(CategorizedCorpusReader, self).fileids()

        if self._f2c is None:
            self._init()

        if isinstance(categories, compat.string_types):
            if categories in self._c2f:
                return sorted(self._c2f[categories])
            else:
                raise ValueError('Category %s not found' % categories)

        # Use .get() so that unknown labels are not inserted into the
        # defaultdict (which would make them show up in categories()),
        # and accumulate into a fresh set so that an empty list of
        # categories yields [] instead of a TypeError.
        found = set()
        for category in categories:
            found.update(self._c2f.get(category, ()))
        return sorted(found)

 

###################################################################### 

#{ Treebank readers 

###################################################################### 

 

#[xx] is it worth it to factor this out? 

class SyntaxCorpusReader(CorpusReader):
    """
    An abstract base class for reading corpora consisting of
    syntactically parsed text.  Subclasses should define:

      - ``__init__``, which specifies the location of the corpus
        and a method for detecting the sentence blocks in corpus files.
      - ``_read_block``, which reads a block from the input stream and
        returns an iterable of items, one per sentence.
      - ``_word``, which takes one such item and returns its list of
        words.
      - ``_tag``, which takes one such item (plus the ``simplify_tags``
        flag) and returns its list of tagged words.
      - ``_parse``, which takes one such item and returns the parsed
        sentence.

    Items for which ``_word``, ``_tag`` or ``_parse`` return a false
    value (e.g. an empty list) are dropped from the results.
    """
    def _parse(self, s):
        raise NotImplementedError()
    def _word(self, s):
        raise NotImplementedError()
    # Accepts ``simplify_tags`` because _read_tagged_sent_block()
    # always passes it as a second positional argument.
    def _tag(self, s, simplify_tags=False):
        raise NotImplementedError()
    def _read_block(self, stream):
        raise NotImplementedError()

    def raw(self, fileids=None):
        """
        Return the contents of the given file(s), read in full and
        combined with ``concat``.

        :param fileids: None, for all files in the corpus; a single
            file identifier; or a list of file identifiers.
        """
        if fileids is None: fileids = self._fileids
        elif isinstance(fileids, compat.string_types): fileids = [fileids]
        contents = []
        for f in fileids:
            stream = self.open(f)
            # Close each stream as soon as it has been read, so file
            # handles are not leaked.
            try:
                contents.append(stream.read())
            finally:
                stream.close()
        return concat(contents)

    def parsed_sents(self, fileids=None):
        """
        Return a view of the parsed sentences in the given file(s): one
        ``StreamBackedCorpusView`` per file, combined with ``concat``.
        """
        reader = self._read_parsed_sent_block
        return concat([StreamBackedCorpusView(path, reader, encoding=enc)
                       for path, enc in self.abspaths(fileids, True)])

    def tagged_sents(self, fileids=None, simplify_tags=False):
        """
        Return a view of the tagged sentences in the given file(s).

        :param simplify_tags: Passed through to ``_tag``.
        """
        def reader(stream):
            return self._read_tagged_sent_block(stream, simplify_tags)
        return concat([StreamBackedCorpusView(path, reader, encoding=enc)
                       for path, enc in self.abspaths(fileids, True)])

    def sents(self, fileids=None):
        """
        Return a view of the sentences (lists of words) in the given
        file(s).
        """
        reader = self._read_sent_block
        return concat([StreamBackedCorpusView(path, reader, encoding=enc)
                       for path, enc in self.abspaths(fileids, True)])

    def tagged_words(self, fileids=None, simplify_tags=False):
        """
        Return a view of the tagged words in the given file(s).

        :param simplify_tags: Passed through to ``_tag``.
        """
        def reader(stream):
            return self._read_tagged_word_block(stream, simplify_tags)
        return concat([StreamBackedCorpusView(path, reader, encoding=enc)
                       for path, enc in self.abspaths(fileids, True)])

    def words(self, fileids=None):
        """
        Return a view of the words in the given file(s).
        """
        return concat([StreamBackedCorpusView(path,
                                              self._read_word_block,
                                              encoding=enc)
                       for path, enc in self.abspaths(fileids, True)])

    #------------------------------------------------------------
    #{ Block Readers

    def _read_word_block(self, stream):
        """Return the words of the next block as one flat list."""
        # Flatten in a single pass; sum(lists, []) copies the growing
        # result for every sentence.
        return [word
                for sent in self._read_sent_block(stream)
                for word in sent]

    def _read_tagged_word_block(self, stream, simplify_tags=False):
        """Return the tagged words of the next block as one flat list."""
        return [tagged_word
                for sent in self._read_tagged_sent_block(stream,
                                                         simplify_tags)
                for tagged_word in sent]

    def _read_sent_block(self, stream):
        """Return the non-empty results of ``_word`` for the next block."""
        sents = [self._word(t) for t in self._read_block(stream)]
        return [sent for sent in sents if sent]

    def _read_tagged_sent_block(self, stream, simplify_tags=False):
        """Return the non-empty results of ``_tag`` for the next block."""
        sents = [self._tag(t, simplify_tags)
                 for t in self._read_block(stream)]
        return [sent for sent in sents if sent]

    def _read_parsed_sent_block(self, stream):
        """Return the non-empty results of ``_parse`` for the next block."""
        trees = [self._parse(t) for t in self._read_block(stream)]
        return [tree for tree in trees if tree]

    #} End of Block Readers
    #------------------------------------------------------------