Hot-keys on this page

r m x p   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

1

2

3

4

5

6

7

8

9

10

11

12

13

14

15

16

17

18

19

20

21

22

23

24

25

26

27

28

29

30

31

32

33

34

35

36

37

38

39

40

41

42

43

44

45

46

47

48

49

50

51

52

53

54

55

56

57

58

59

60

61

62

63

64

65

66

67

68

69

70

71

72

73

74

75

76

77

78

79

80

81

82

83

84

85

86

87

88

89

90

91

92

93

94

95

96

97

98

99

100

101

102

103

104

105

106

107

108

109

110

111

112

113

114

# Natural Language Toolkit: Aligned Corpus Reader 

# 

# Copyright (C) 2001-2012 NLTK Project 

# URL: <http://www.nltk.org/> 

# Author: Steven Bird <sb@csse.unimelb.edu.au> 

# For license information, see LICENSE.TXT 

 

from nltk import compat 

from nltk.tokenize import WhitespaceTokenizer, RegexpTokenizer 

from nltk.align import AlignedSent 

 

from nltk.corpus.reader.api import CorpusReader 

from nltk.corpus.reader.util import StreamBackedCorpusView, concat,\ 

    read_alignedsent_block 

 

class AlignedCorpusReader(CorpusReader):
    """
    Reader for corpora of word-aligned sentences.  Tokens are assumed
    to be separated by whitespace.  Sentences begin on separate lines.
    """
    def __init__(self, root, fileids,
                 sep='/', word_tokenizer=WhitespaceTokenizer(),
                 sent_tokenizer=RegexpTokenizer('\n', gaps=True),
                 alignedsent_block_reader=read_alignedsent_block,
                 encoding=None):
        """
        Construct a new Aligned Corpus reader for a set of documents
        located at the given root directory.  Example usage:

            >>> root = '/...path to corpus.../'
            >>> reader = AlignedCorpusReader(root, '.*')

        :param root: The root directory for this corpus.
        :param fileids: A list or regexp specifying the fileids in this corpus.
        :param sep: Stored on the reader as ``_sep``; none of the methods
            of this class read it.
        :param word_tokenizer: Object whose ``tokenize(str)`` method splits
            one line of a block into tokens.
        :param sent_tokenizer: Object whose ``tokenize(str)`` method splits
            one block string into its individual lines.
        :param alignedsent_block_reader: Callable taking a stream and
            returning an iterable of block strings.
        :param encoding: Passed through unchanged to ``CorpusReader``.
        """
        CorpusReader.__init__(self, root, fileids, encoding)
        self._sep = sep
        self._word_tokenizer = word_tokenizer
        self._sent_tokenizer = sent_tokenizer
        self._alignedsent_block_reader = alignedsent_block_reader

    def raw(self, fileids=None):
        """
        :return: the given file(s) as a single string.
        :rtype: str
        """
        if fileids is None:
            fileids = self._fileids
        elif isinstance(fileids, compat.string_types):
            fileids = [fileids]

        contents = []
        for fileid in fileids:
            stream = self.open(fileid)
            # Close every stream explicitly (even if read() fails) instead
            # of leaving the handle to be reclaimed by the garbage collector.
            try:
                contents.append(stream.read())
            finally:
                stream.close()
        return concat(contents)

    def words(self, fileids=None):
        """
        :return: the given file(s) as a list of words
            and punctuation symbols.
        :rtype: list(str)
        """
        return self._aligned_views(fileids, False, False)

    def sents(self, fileids=None):
        """
        :return: the given file(s) as a list of
            sentences or utterances, each encoded as a list of word
            strings.
        :rtype: list(list(str))
        """
        return self._aligned_views(fileids, False, True)

    def aligned_sents(self, fileids=None):
        """
        :return: the given file(s) as a list of AlignedSent objects.
        :rtype: list(AlignedSent)
        """
        return self._aligned_views(fileids, True, True)

    def _aligned_views(self, fileids, aligned, group_by_sent):
        """
        Build one ``AlignedSentCorpusView`` per selected file and
        concatenate them.

        :param fileids: File selection, forwarded to ``abspaths``.
        :param aligned: Forwarded to the view as its ``aligned`` flag.
        :param group_by_sent: Forwarded to the view as its
            ``group_by_sent`` flag.
        :return: the result of ``concat`` over the per-file views.
        """
        return concat([AlignedSentCorpusView(path, enc, aligned, group_by_sent,
                                             self._word_tokenizer,
                                             self._sent_tokenizer,
                                             self._alignedsent_block_reader)
                       for (path, enc) in self.abspaths(fileids, True)])

 

class AlignedSentCorpusView(StreamBackedCorpusView):
    """
    A specialized corpus view for aligned sentences.
    ``AlignedSentCorpusView`` objects are typically created by
    ``AlignedCorpusReader`` (not directly by nltk users).
    """
    def __init__(self, corpus_file, encoding, aligned, group_by_sent,
                 word_tokenizer, sent_tokenizer, alignedsent_block_reader):
        """
        :param corpus_file: Forwarded to ``StreamBackedCorpusView``.
        :param encoding: Forwarded to ``StreamBackedCorpusView``.
        :param aligned: If true, ``read_block`` yields ``AlignedSent``
            objects.
        :param group_by_sent: If true (and ``aligned`` is false),
            ``read_block`` yields the first line of each block as one
            list of tokens; if both flags are false it yields that
            line's tokens individually.
        :param word_tokenizer: Object whose ``tokenize(str)`` method
            splits a line into tokens.
        :param sent_tokenizer: Object whose ``tokenize(str)`` method
            splits a block string into lines.
        :param alignedsent_block_reader: Callable taking a stream and
            returning an iterable of block strings.
        """
        self._aligned = aligned
        self._group_by_sent = group_by_sent
        self._word_tokenizer = word_tokenizer
        self._sent_tokenizer = sent_tokenizer
        self._alignedsent_block_reader = alignedsent_block_reader
        StreamBackedCorpusView.__init__(self, corpus_file, encoding=encoding)

    def read_block(self, stream):
        """
        Read one block from ``stream`` and convert it according to the
        view's flags.

        :param stream: Stream handed to the block reader.
        :return: ``[]`` if the block reader produced no lines; otherwise
            a one-element list holding an ``AlignedSent`` (aligned mode),
            a one-element list holding the first line's tokens
            (sentence mode), or the first line's tokens themselves
            (word mode).
        :raise IndexError: in aligned mode, if the block has at least
            one but fewer than three lines.
        """
        lines = [line
                 for alignedsent_str in self._alignedsent_block_reader(stream)
                 for line in self._sent_tokenizer.tokenize(alignedsent_str)]

        # Nothing was read (e.g. only trailing blank lines were left in
        # the stream): report "no tokens" instead of failing on lines[0].
        if not lines:
            return []

        tokenize = self._word_tokenizer.tokenize

        if self._aligned:
            # NOTE(review): presumably lines are (source sentence, target
            # sentence, alignment) in that order -- confirm against the
            # block reader.
            fields = [tokenize(line) for line in lines[:2]]
            # The alignment line is not a sentence, so it is only
            # whitespace-normalised and never passed to the word tokenizer.
            fields.append(" ".join(lines[2].split()))
            # Any further lines are still forwarded to AlignedSent, as before.
            fields.extend(tokenize(line) for line in lines[3:])
            return [AlignedSent(*fields)]

        first_sentence = tokenize(lines[0])
        if self._group_by_sent:
            return [first_sentence]
        return first_sentence