Hot-keys on this page

r m x p   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

1

2

3

4

5

6

7

8

9

10

11

12

13

14

15

16

17

18

19

20

21

22

23

24

25

26

27

28

29

30

31

32

33

34

35

36

37

38

39

40

41

42

43

44

45

46

47

48

49

50

51

52

53

54

55

56

57

58

59

60

61

62

63

64

65

66

67

68

# Natural Language Toolkit: Toolbox Reader 

# 

# Copyright (C) 2001-2012 NLTK Project 

# Author: Greg Aumann <greg_aumann@sil.org> 

#         Stuart Robinson <Stuart.Robinson@mpi.nl> 

#         Steven Bird <sb@csse.unimelb.edu.au> 

# URL: <http://www.nltk.org/> 

# For license information, see LICENSE.TXT 

 

""" 

Module for reading, writing and manipulating 

Toolbox databases and settings files. 

""" 

 

import os 

import re 

import codecs 

 

from nltk import compat 

from nltk.toolbox import ToolboxData 

 

from .util import * 

from .api import * 

 

class ToolboxCorpusReader(CorpusReader):
    """Corpus reader for Toolbox files.

    Each file is handed to ``ToolboxData`` for parsing. The methods expose
    the result as parsed XML, as ``(marker, contents)`` fields, as entries
    grouped under a key marker, or as raw file contents.
    """

    def xml(self, fileids, key=None):
        """Return the combined ``ToolboxData.parse`` results for *fileids*.

        :param fileids: file identifier(s), resolved through
            ``self.abspaths``.
        :param key: forwarded unchanged to ``ToolboxData.parse``.
        :return: the per-file parse results combined with ``concat``.
        """
        return concat([ToolboxData(path, enc).parse(key)
                       for (path, enc)
                       in self.abspaths(fileids, include_encoding=True)])

    def fields(self, fileids, strip=True, unwrap=True, encoding=None,
               errors='strict', unicode_fields=None):
        """Return the fields of *fileids* as ``(marker, contents)`` pairs.

        ``strip``, ``unwrap``, ``encoding``, ``errors`` and
        ``unicode_fields`` are forwarded positionally, in that order, to
        ``ToolboxData.fields``; see that method for their meaning.  Note
        that ``encoding`` here is separate from the per-file encoding that
        ``self.abspaths`` supplies to the ``ToolboxData`` constructor.

        :param fileids: file identifier(s), resolved through
            ``self.abspaths``.
        :return: the per-file field lists combined with ``concat``.
        """
        return concat([list(ToolboxData(fileid, enc).fields(
                             strip, unwrap, encoding, errors, unicode_fields))
                       for (fileid, enc)
                       in self.abspaths(fileids, include_encoding=True)])

    # should probably be done lazily:
    def entries(self, fileids, **kwargs):
        """Group the fields of *fileids* into entries.

        A new entry starts at every field whose marker equals ``key``.
        Every following field is attached to that entry until the next
        ``key`` field.  Fields that occur before the first ``key`` field
        belong to no entry and are dropped.

        :param fileids: file identifier(s), passed on to ``self.fields``.
        :param kwargs: may contain ``key``, the marker that opens an entry
            (default ``'lx'``); all remaining keyword arguments are
            forwarded to ``self.fields``.
        :return: a list of ``(key_contents, [(marker, contents), ...])``
            tuples.
        """
        # 'lx' is the default key in MDF.  ``key`` is removed from kwargs
        # because ``self.fields`` does not accept it.
        key = kwargs.pop('key', 'lx')
        entries = []
        for marker, contents in self.fields(fileids, **kwargs):
            if marker == key:
                entries.append((contents, []))
            elif entries:
                # Attach to the most recently opened entry.
                entries[-1][-1].append((marker, contents))
            # else: no entry has been opened yet, so the field is skipped.
        return entries

    def words(self, fileids, key='lx'):
        """Return the contents of every field whose marker equals *key*.

        :param fileids: file identifier(s), passed on to ``self.fields``.
        :param key: the marker to select (default ``'lx'``).
        :return: a list of field contents, in file order.
        """
        return [contents
                for marker, contents in self.fields(fileids)
                if marker == key]

    def raw(self, fileids):
        """Return the contents of *fileids* as read from ``self.open``.

        :param fileids: a single file identifier, an iterable of them, or
            ``None`` to use every file in ``self._fileids``.
        :return: the per-file contents combined with ``concat``.
        """
        if fileids is None:
            fileids = self._fileids
        elif isinstance(fileids, compat.string_types):
            fileids = [fileids]
        contents = []
        for fileid in fileids:
            stream = self.open(fileid)
            # Close every stream explicitly so that reading many files does
            # not leak file handles, even when read() raises.
            try:
                contents.append(stream.read())
            finally:
                stream.close()
        return concat(contents)

 

 

def demo():
    """Placeholder demonstration hook; it currently performs no work."""

 

# Run demo() only when this file is executed as a script, not when imported.
if __name__ == '__main__': 

    demo()