Hot-keys on this page

r m x p   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

1

2

3

4

5

6

7

8

9

10

11

12

13

14

15

16

17

18

19

20

21

22

23

24

25

26

27

28

29

30

31

32

33

34

35

36

37

38

39

40

41

42

43

44

45

46

47

48

49

50

51

52

53

54

55

56

57

58

59

60

61

62

63

64

65

66

67

68

69

70

71

72

73

74

75

76

77

78

79

80

81

82

83

84

85

86

87

88

89

90

91

92

93

94

95

96

97

98

99

100

101

102

103

104

105

106

107

108

109

110

111

112

113

114

115

116

117

118

119

120

121

122

123

124

125

126

127

128

129

130

131

132

133

134

135

136

137

138

139

140

141

142

143

144

145

146

147

148

149

150

151

152

153

154

155

156

157

158

159

160

161

162

163

164

165

166

167

168

169

170

171

172

173

174

175

176

177

178

179

180

181

182

183

184

185

186

187

188

189

190

191

192

193

194

195

196

197

198

199

200

201

202

203

204

205

206

207

208

209

210

211

212

213

214

215

216

217

218

219

220

221

222

223

224

225

226

227

228

229

230

231

232

233

234

235

236

237

238

239

240

241

242

243

244

245

246

# Natural Language Toolkit: BNC Corpus Reader

# 

# Copyright (C) 2001-2012 NLTK Project 

# Author: Edward Loper <edloper@gradient.cis.upenn.edu> 

# URL: <http://www.nltk.org/> 

# For license information, see LICENSE.TXT 

 

""" 

Corpus reader for the XML version of the British National Corpus. 

""" 

__docformat__ = 'epytext en' 

 

import re 

 

import xml.etree.ElementTree as ET 

 

from .api import * 

from .util import * 

from .xmldocs import * 

 

class BNCCorpusReader(XMLCorpusReader):
    """
    Reader for the XML edition of the British National Corpus.

    The complete XML structure is reachable through the ``xml()``
    method.  Plain and tagged token lists, flat or grouped by sentence,
    are provided by ``words()``, ``sents()``, ``tagged_words()`` and
    ``tagged_sents()``.
    """
    def __init__(self, root, fileids, lazy=True):
        XMLCorpusReader.__init__(self, root, fileids)
        # True  -> accessors build one BNCWordView per file;
        # False -> accessors parse each file up front with _words().
        self._lazy = lazy

    def words(self, fileids=None, strip_space=True, stem=False):
        """
        :return: the given file(s) as a list of words
            and punctuation symbols.
        :rtype: list(str)

        :param strip_space: If true, then strip trailing spaces from
            word tokens.  Otherwise, leave the spaces on the tokens.
        :param stem: If true, then use word stems instead of word strings.
        """
        # Both callables take the same positional arguments.
        reader = BNCWordView if self._lazy else self._words
        return concat([reader(path, False, None, strip_space, stem)
                       for path in self.abspaths(fileids)])

    def tagged_words(self, fileids=None, c5=False, strip_space=True, stem=False):
        """
        :return: the given file(s) as a list of tagged
            words and punctuation symbols, encoded as tuples
            ``(word,tag)``.
        :rtype: list(tuple(str,str))

        :param c5: If true, then the tags used will be the more detailed
            c5 tags.  Otherwise, the simplified tags will be used.
        :param strip_space: If true, then strip trailing spaces from
            word tokens.  Otherwise, leave the spaces on the tokens.
        :param stem: If true, then use word stems instead of word strings.
        """
        tag = 'c5' if c5 else 'pos'
        reader = BNCWordView if self._lazy else self._words
        return concat([reader(path, False, tag, strip_space, stem)
                       for path in self.abspaths(fileids)])

    def sents(self, fileids=None, strip_space=True, stem=False):
        """
        :return: the given file(s) as a list of
            sentences or utterances, each encoded as a list of word
            strings.
        :rtype: list(list(str))

        :param strip_space: If true, then strip trailing spaces from
            word tokens.  Otherwise, leave the spaces on the tokens.
        :param stem: If true, then use word stems instead of word strings.
        """
        reader = BNCWordView if self._lazy else self._words
        return concat([reader(path, True, None, strip_space, stem)
                       for path in self.abspaths(fileids)])

    def tagged_sents(self, fileids=None, c5=False, strip_space=True,
                     stem=False):
        """
        :return: the given file(s) as a list of
            sentences, each encoded as a list of ``(word,tag)`` tuples.
        :rtype: list(list(tuple(str,str)))

        :param c5: If true, then the tags used will be the more detailed
            c5 tags.  Otherwise, the simplified tags will be used.
        :param strip_space: If true, then strip trailing spaces from
            word tokens.  Otherwise, leave the spaces on the tokens.
        :param stem: If true, then use word stems instead of word strings.
        """
        tag = 'c5' if c5 else 'pos'
        reader = BNCWordView if self._lazy else self._words
        return concat([reader(path, True, tag, strip_space, stem)
                       for path in self.abspaths(fileids)])

    def _words(self, fileid, bracket_sent, tag, strip_space, stem):
        """
        Eager counterpart of the lazy views: parse one file completely
        and return its tokens, either flat or grouped per sentence,
        optionally paired with a tag.

        :param fileid: The name of the underlying file.
        :param bracket_sent: If true, include sentence bracketing.
        :param tag: The name of the tagset to use, or None for no tags.
        :param strip_space: If true, strip spaces from word tokens.
        :param stem: If true, then substitute stems for words.
        """
        # Stems are always looked up / returned without padding, so
        # stemming implies stripping.
        trim = strip_space or stem
        collected = []

        # NOTE(review): ``ElementTree`` is not imported by name in this
        # module (only ``xml.etree.ElementTree as ET`` is); presumably it
        # arrives through one of the star imports -- confirm.
        root = ElementTree.parse(fileid).getroot()
        for sent_elt in root.findall('.//s'):
            tokens = []
            for word_elt in _all_xmlwords_in(sent_elt):
                token = word_elt.text or "" # fixes issue 337?
                if trim:
                    token = token.strip()
                if stem:
                    # Fall back to the surface form when no headword
                    # ('hw') attribute is present.
                    token = word_elt.get('hw', token)
                if tag == 'c5':
                    token = (token, word_elt.get('c5'))
                elif tag == 'pos':
                    # Use the c5 tag when the element has no 'pos'.
                    token = (token, word_elt.get('pos', word_elt.get('c5')))
                tokens.append(token)
            if bracket_sent:
                collected.append(BNCSentence(sent_elt.attrib['n'], tokens))
            else:
                collected.extend(tokens)

        assert None not in collected
        return collected

 

def _all_xmlwords_in(elt, result=None):
    """
    Collect, in document order, every ``c`` and ``w`` element found
    below ``elt``.  Elements with any other tag are searched
    recursively; ``c``/``w`` elements themselves are not descended into.

    :param elt: The element whose descendants are searched.
    :param result: Optional list to append to; a new list is created
        when omitted.
    :return: ``result`` (the list that was appended to).
    """
    if result is None:
        result = []
    # Stack of partially consumed child iterators, innermost last.
    pending = [iter(elt)]
    while pending:
        for node in pending[-1]:
            if node.tag in ('c', 'w'):
                result.append(node)
            else:
                # Descend now; the parent iterator resumes afterwards.
                pending.append(iter(node))
                break
        else:
            # Innermost iterator exhausted: go back up one level.
            pending.pop()
    return result

 

class BNCSentence(list):
    """
    A sentence stored as an ordinary list of its tokens, with one extra
    attribute, ``num``, holding the sentence identifier (the value of
    the ``n`` attribute in the XML).
    """
    def __init__(self, num, items):
        super(BNCSentence, self).__init__(items)
        self.num = num

 

class BNCWordView(XMLCorpusView):
    """
    A stream backed corpus view specialized for use with the BNC corpus.

    Depending on the ``sent`` flag given at construction time the view
    produces either individual word tokens or ``BNCSentence`` lists.
    Metadata read from the ``teiHeader`` while the view is constructed
    is stored in ``title``, ``author``, ``editor`` and ``resps``.
    """

    title = None #: Title of the document.
    author = None #: Author of the document.
    editor = None #: Editor
    resps = None #: Statement of responsibility

    def __init__(self, fileid, sent, tag, strip_space, stem):
        """
        :param fileid: The name of the underlying file.
        :param sent: If true, include sentence bracketing.
        :param tag: The name of the tagset to use, or None for no tags.
        :param strip_space: If true, strip spaces from word tokens.
        :param stem: If true, then substitute stems for words.
        """
        # Sentence mode selects whole <s> elements; word mode selects
        # <c>/<w> elements at any depth below an <s>.
        if sent: tagspec = '.*/s'
        else: tagspec = '.*/s/(.*/)?(c|w)'
        self._sent = sent
        self._tag = tag
        self._strip_space = strip_space
        self._stem = stem

        XMLCorpusView.__init__(self, fileid, tagspec)

        # Read in a tasty header.  The stream is closed again even if
        # scanning the header raises, so a malformed file does not leak
        # an open file handle.
        self._open()
        try:
            self.read_block(self._stream, '.*/teiHeader$', self.handle_header)
        finally:
            self.close()

        # Reset tag context.
        # NOTE(review): presumably this undoes state that the header
        # read above recorded in the base class -- confirm against
        # XMLCorpusView.
        self._tag_context = {0: ()}

    def handle_header(self, elt, context):
        """
        Record document metadata from a ``teiHeader`` element.

        Sets ``title``, ``author`` and ``editor`` to the newline-joined
        text of the matching ``titleStmt`` children, and ``resps`` to
        the text of each ``respStmt``'s children (joined by newlines,
        with a blank line between statements).  Attributes for which no
        element is found are left untouched.

        :param elt: The header element.
        :param context: Unused.
        """
        def text_of(node):
            # ``node.text`` is None for an element without text; treat
            # that as empty instead of failing on ``None.strip()``.
            return (node.text or '').strip()

        titles = elt.findall('titleStmt/title')
        if titles:
            self.title = '\n'.join([text_of(node) for node in titles])

        authors = elt.findall('titleStmt/author')
        if authors:
            self.author = '\n'.join([text_of(node) for node in authors])

        editors = elt.findall('titleStmt/editor')
        if editors:
            self.editor = '\n'.join([text_of(node) for node in editors])

        resps = elt.findall('titleStmt/respStmt')
        if resps:
            self.resps = '\n\n'.join([
                '\n'.join([text_of(part) for part in resp])
                for resp in resps])

    def handle_elt(self, elt, context):
        """
        Convert one selected element into a sentence or a word,
        depending on the mode chosen at construction time.

        :param elt: The selected element.
        :param context: Unused.
        """
        if self._sent: return self.handle_sent(elt)
        else: return self.handle_word(elt)

    def handle_word(self, elt):
        """
        Build the token for a single word/punctuation element.

        :param elt: The ``w`` or ``c`` element.
        :return: the word string, or a ``(word, tag)`` tuple when a
            tagset was requested.
        """
        word = elt.text
        if not word:
            word = "" # fixes issue 337?
        # Stemming implies stripping.
        if self._strip_space or self._stem:
            word = word.strip()
        if self._stem:
            # Keep the surface form when there is no 'hw' attribute.
            word = elt.get('hw', word)
        if self._tag == 'c5':
            word = (word, elt.get('c5'))
        elif self._tag == 'pos':
            # Use the c5 tag when the element has no 'pos' attribute.
            word = (word, elt.get('pos', elt.get('c5')))
        return word

    def handle_sent(self, elt):
        """
        Build a ``BNCSentence`` from an ``s`` element.

        Direct ``w``/``c`` children and the children of ``mw`` elements
        are converted with ``handle_word``.  Any other child is searched
        recursively for ``w``/``c`` elements, the same way the eager
        reader collects words with ``_all_xmlwords_in`` and in line with
        the word-level tagspec, which accepts words at any depth below
        an ``s``.

        :param elt: The sentence element; must carry an ``n`` attribute.
        :return: the sentence, with ``num`` set to the ``n`` attribute.
        """
        sent = []
        for child in elt:
            if child.tag == 'mw':
                sent += [self.handle_word(w) for w in child]
            elif child.tag in ('w','c'):
                sent.append(self.handle_word(child))
            else:
                # Wrapper markup around words: collect the nested words
                # instead of rejecting the whole sentence.
                sent += [self.handle_word(w)
                         for w in _all_xmlwords_in(child)]
        return BNCSentence(elt.attrib['n'], sent)