Hot-keys on this page

r m x p   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

1

2

3

4

5

6

7

8

9

10

11

12

13

14

15

16

17

18

19

20

21

22

23

24

25

26

27

28

29

30

31

32

33

34

35

36

37

38

39

40

41

42

43

44

45

46

47

48

49

50

51

52

53

54

55

56

57

58

59

60

61

62

63

64

65

66

67

68

69

70

71

72

73

74

75

76

77

78

79

80

81

82

83

84

85

86

87

88

89

90

91

92

93

94

95

96

97

98

99

100

101

102

103

104

105

106

107

108

109

110

111

112

113

114

115

116

117

118

119

120

121

122

123

124

125

126

127

128

129

130

131

132

133

134

135

136

137

138

139

140

141

142

143

144

145

146

147

148

149

150

151

152

153

154

155

156

157

158

159

160

161

162

163

#! /usr/bin/env python 

# KNB Corpus reader 

# Copyright (C) 2001-2012 NLTK Project 

# Author: Masato Hagiwara <hagisan@gmail.com> 

# URL: <http://www.nltk.org/> 

# For license information, see LICENSE.TXT 

 

# For more information, see http://lilyx.net/pages/nltkjapanesecorpus.html 

from __future__ import print_function 

 

import sys 

 

from nltk import compat 

from nltk.tree import bracket_parse, Tree 

from nltk.parse import DependencyGraph 

 

from nltk.corpus.reader.util import * 

from nltk.corpus.reader.api import * 

 

# default function to convert morphlist to str for tree representation 

_morphs2str_default = lambda morphs: '/'.join(m[0] for m in morphs if m[0] != 'EOS').encode('utf-8') 

 

class KNBCorpusReader(SyntaxCorpusReader):
    """
    Reader for the KNB corpus (ChaSen/KNP-style annotated Japanese text).

    This class implements:
      - ``__init__``, which specifies the location of the corpus
        and a method for detecting the sentence blocks in corpus files.
      - ``_read_block``, which reads a block from the input stream.
      - ``_word``, which takes a block and returns a list of list of words.
      - ``_tag``, which takes a block and returns a list of list of tagged
        words.
      - ``_parse``, which takes a block and returns a list of parsed
        sentences.

    The structure of tagged words:
      tagged_word = (word(str), tags(tuple))
      tags = (surface, reading, lemma, pos1, posid1, pos2, posid2, pos3, posid3, others ...)
    """

    def __init__(self, root, fileids, encoding=None, morphs2str=_morphs2str_default):
        """
        Initialize KNBCorpusReader.

        :param root: path to the corpus root directory.
        :param fileids: fileid list or regexp selecting corpus files.
        :param encoding: file encoding passed through to ``CorpusReader``.
        :param morphs2str: function converting a morph list to a string for
            the tree representation built by ``_parse()``.
        """
        CorpusReader.__init__(self, root, fileids, encoding)
        self.morphs2str = morphs2str

    def _read_block(self, stream):
        # blocks are split by blankline (or EOF) - default
        return read_blankline_block(stream)

    def _word(self, t):
        # Return the surface forms (first space-separated cell of each
        # morph line) found in sentence block *t*.
        res = []
        for line in t.splitlines():
            # ignore the Bunsetsu headers ("*", "+"), comments ("#") and EOS
            if not re.match(r"EOS|\*|\#|\+", line):
                cells = line.strip().split(" ")
                res.append(cells[0])

        return res

    def _tag(self, t, simplify_tags=False):
        # Return (word, tags) pairs for each morph line in block *t*;
        # the tag component is the remainder of the line joined back into
        # one space-separated string.
        # NOTE(review): ``simplify_tags`` is accepted for API compatibility
        # with other SyntaxCorpusReader subclasses but is not used here.
        res = []
        for line in t.splitlines():
            # ignore the Bunsetsu headers ("*", "+"), comments ("#") and EOS
            if not re.match(r"EOS|\*|\#|\+", line):
                cells = line.strip().split(" ")
                # convert cells to morph tuples
                res.append( (cells[0], ' '.join(cells[1:])) )

        return res

    def _parse(self, t):
        # Build a dependency tree for one sentence block.  Header lines
        # ("*" = bunsetsu, "+" = tag unit) open a new node; plain morph
        # lines are appended to the most recently opened node.
        # NOTE(review): this relies on the pre-NLTK-3.0 DependencyGraph
        # API, where ``nodelist`` is a plain list of dicts.
        dg = DependencyGraph()
        i = 0  # index of the next node to fill
        for line in t.splitlines():
            if line.startswith("*") or line.startswith("+"):
                # start of bunsetsu or tag

                # header looks like "* 1D ...": parent index + dep type
                cells = line.strip().split(" ", 3)
                m = re.match(r"([\-0-9]*)([ADIP])", cells[1])

                assert m is not None

                # NOTE(review): nodelist[i] is read before the while-loop
                # below grows the list; this presumes earlier iterations
                # (whose parents usually point forward) already created
                # slot i -- confirm against KNB data before refactoring.
                node = dg.nodelist[i]
                node['address'] = i
                node['rel'] = m.group(2)  # dep_type

                node['word'] = []

                dep_parent = int(m.group(1))  # -1 marks the root node

                # ensure slots exist for both this node and its parent
                while len(dg.nodelist) < i+1 or len(dg.nodelist) < dep_parent+1:
                    dg.nodelist.append({'word':[], 'deps':[]})

                if dep_parent == -1:
                    dg.root = node
                else:
                    dg.nodelist[dep_parent]['deps'].append(i)

                i += 1
            elif not line.startswith("#"):
                # normal morph
                cells = line.strip().split(" ")
                # convert cells to morph tuples
                morph = ( cells[0], ' '.join(cells[1:]) )
                dg.nodelist[i-1]['word'].append(morph)

        # collapse each node's morph list into a display string so the
        # resulting Tree has printable leaves
        if self.morphs2str:
            for node in dg.nodelist:
                node['word'] = self.morphs2str(node['word'])

        return dg.tree()

 

###################################################################### 

# Demo 

###################################################################### 

 

def demo():
    """Print samples from the locally installed KNB corpus.

    Shows the first fileids, a run of raw words, parsed trees with the
    default and then a POS-annotating ``morphs2str``, and tagged
    sentences.  Requires ``corpora/knbc/corpus1`` under an NLTK data path.
    """
    import nltk
    from nltk.corpus.util import LazyCorpusLoader

    corpus_root = nltk.data.find('corpora/knbc/corpus1')

    # keep only the numbered corpus files (e.g. "KN001_Keitai_1-1-1-01")
    matching = []
    for fileid in find_corpus_fileids(FileSystemPathPointer(corpus_root), ".*"):
        if re.search(r"\d\-\d\-[\d]+\-[\d]+", fileid):
            matching.append(fileid)

    def _sort_key(fileid):
        # order numerically on the three numeric name components
        cells = fileid.split('-')
        return (cells[0], int(cells[1]), int(cells[2]), int(cells[3]))

    knbc = LazyCorpusLoader('knbc/corpus1', KNBCorpusReader,
                            sorted(matching, key=_sort_key), encoding='euc-jp')

    print(knbc.fileids()[:10])
    print(''.join(knbc.words()[:100]))

    print('\n\n'.join('%s' % tree for tree in knbc.parsed_sents()[:2]))

    # swap in a morphs2str that also shows each morph's POS field
    knbc.morphs2str = lambda morphs: '/'.join(
        "%s(%s)" % (m[0], m[1].split(' ')[2]) for m in morphs if m[0] != 'EOS'
    ).encode('utf-8')

    print('\n\n'.join('%s' % tree for tree in knbc.parsed_sents()[:2]))

    tagged_lines = (
        ' '.join("%s/%s" % (w[0], w[1].split(' ')[2]) for w in sent)
        for sent in knbc.tagged_sents()[0:2]
    )
    print('\n'.join(tagged_lines))

 

def test():
    """Smoke-test the reader's output types against the installed corpus."""
    from nltk.corpus.util import LazyCorpusLoader

    reader = LazyCorpusLoader(
        'knbc/corpus1', KNBCorpusReader, r'.*/KN.*', encoding='euc-jp')

    # words and sentence elements are plain strings
    assert isinstance(reader.words()[0], compat.string_types)
    assert isinstance(reader.sents()[0][0], compat.string_types)
    # tagged views yield (word, tags) tuples
    assert isinstance(reader.tagged_words()[0], tuple)
    assert isinstance(reader.tagged_sents()[0][0], tuple)

 

# Script entry point: run the demo (requires the KNB corpus to be
# installed under an NLTK data path).
if __name__ == '__main__':
    demo()
    # test()