Hot-keys on this page

r m x p   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

1

2

3

4

5

6

7

8

9

10

11

12

13

14

15

16

17

18

19

20

21

22

23

24

25

26

27

28

29

30

31

32

33

34

35

36

37

38

39

40

41

42

43

44

45

46

47

48

49

50

51

52

53

54

55

56

57

58

59

60

61

62

63

64

65

66

67

68

69

70

71

72

73

74

75

76

77

78

79

80

81

82

83

84

85

86

87

88

89

90

91

92

93

94

95

96

97

98

99

100

101

102

103

104

105

106

107

108

109

110

111

112

113

114

115

116

117

118

119

120

121

122

123

124

125

126

127

128

129

130

131

132

133

134

135

136

137

138

139

140

141

142

143

144

145

146

147

148

149

150

151

152

153

154

155

156

157

158

159

160

161

162

163

164

165

166

167

168

169

170

171

172

173

174

175

176

177

178

179

180

181

182

183

184

185

186

187

188

189

190

191

192

193

194

195

196

197

198

199

200

201

202

203

204

205

206

207

208

209

210

211

212

213

214

215

216

217

218

219

220

221

222

223

224

225

226

227

228

229

230

231

232

233

234

235

236

237

238

239

240

241

242

# -*- coding: iso-8859-1 -*- 

 

# Natural Language Toolkit: York-Toronto-Helsinki Parsed Corpus of Old English Prose (YCOE) 

# 

# Copyright (C) 2001-2012 NLTK Project 

# Author: Selina Dennis <selina@tranzfusion.net> 

# URL: <http://www.nltk.org/> 

# For license information, see LICENSE.TXT 

 

""" 

Corpus reader for the York-Toronto-Helsinki Parsed Corpus of Old 

English Prose (YCOE), a 1.5 million word syntactically-annotated 

corpus of Old English prose texts. The corpus is distributed by the 

Oxford Text Archive: http://www.ota.ahds.ac.uk/ It is not included 

with NLTK. 

 

The YCOE corpus is divided into 100 files, each representing 

an Old English prose text. Tags used within each text comply

with the YCOE standard: http://www-users.york.ac.uk/~lang22/YCOE/YcoeHome.htm

""" 

 

import os 

import re 

 

from nltk import compat 

from nltk.tokenize import RegexpTokenizer 

from nltk.corpus.reader.bracket_parse import BracketParseCorpusReader 

from nltk.corpus.reader.tagged import TaggedCorpusReader 

 

from .util import * 

from .api import * 

 

class YCOECorpusReader(CorpusReader):
    """
    Corpus reader for the York-Toronto-Helsinki Parsed Corpus of Old
    English Prose (YCOE), a 1.5 million word syntactically-annotated
    corpus of Old English prose texts.

    The reader wraps two sub-readers: one over the ``psd``
    subdirectory and one over the ``pos`` subdirectory of the corpus
    root.  A *document identifier* is a file identifier with its
    four-character ``.psd`` / ``.pos`` extension removed.
    """
    def __init__(self, root, encoding=None):
        """
        :param root: Root directory of the corpus; its ``psd`` and
            ``pos`` subdirectories are handed to the sub-readers.
        :param encoding: Encoding passed to ``CorpusReader.__init__``
            and to the constructors of both sub-readers.
        :raise ValueError: If the ``psd`` and ``pos`` subdirectories
            do not contain the same set of documents.
        """
        # Initialise once with no fileids so that ``self.root`` is
        # available for building the sub-reader paths below.
        CorpusReader.__init__(self, root, [], encoding)

        # NOTE(review): '.psd' is the third positional argument of the
        # bracket-parse reader -- confirm which parameter it binds to.
        self._psd_reader = YCOEParseCorpusReader(
            self.root.join('psd'), '.*', '.psd', encoding=encoding)
        # The third parameter of YCOETaggedCorpusReader is ``encoding``,
        # so pass the encoding by keyword instead of a file extension.
        self._pos_reader = YCOETaggedCorpusReader(
            self.root.join('pos'), '.*', encoding=encoding)

        # Make sure we have a consistent set of items:
        documents = set(f[:-4] for f in self._psd_reader.fileids())
        if set(f[:-4] for f in self._pos_reader.fileids()) != documents:
            raise ValueError('Items in "psd" and "pos" '
                             'subdirectories do not match.')

        fileids = sorted(['%s.psd' % doc for doc in documents] +
                         ['%s.pos' % doc for doc in documents])
        # Re-initialise now that the full list of fileids is known.
        CorpusReader.__init__(self, root, fileids, encoding)
        self._documents = sorted(documents)

    def documents(self, fileids=None):
        """
        Return a list of document identifiers for all documents in
        this corpus, or for the documents with the given file(s) if
        specified.

        :param fileids: A single file identifier, a collection of file
            identifiers, or None for every document.
        :raise KeyError: If a given file identifier is not part of
            this corpus.
        """
        if fileids is None:
            return self._documents
        if isinstance(fileids, compat.string_types):
            fileids = [fileids]
        for f in fileids:
            if f not in self._fileids:
                # Report the offending id, not the whole collection.
                raise KeyError('File id %s not found' % (f,))
        # Strip off the '.pos' and '.psd' extensions.
        return sorted(set(f[:-4] for f in fileids))

    def fileids(self, documents=None):
        """
        Return a list of file identifiers for the files that make up
        this corpus, or that store the given document(s) if specified.

        :param documents: A single document identifier, a collection
            of document identifiers, or None for every file.
        """
        if documents is None:
            return self._fileids
        elif isinstance(documents, compat.string_types):
            documents = [documents]
        return sorted(set(['%s.pos' % doc for doc in documents] +
                          ['%s.psd' % doc for doc in documents]))

    def _getfileids(self, documents, subcorpus):
        """
        Helper that selects the appropriate fileids for a given set of
        documents from a given subcorpus (pos or psd).

        :param documents: A single document identifier, a collection
            of document identifiers, or None for every document.
        :param subcorpus: The extension (``'pos'`` or ``'psd'``)
            appended to each document identifier.
        :raise ValueError: If a given identifier is not a known
            document identifier.
        """
        if documents is None:
            documents = self._documents
        else:
            if isinstance(documents, compat.string_types):
                documents = [documents]
            for document in documents:
                if document not in self._documents:
                    # Give a more helpful message when a file id was
                    # passed where a document id was expected.
                    if document[-4:] in ('.pos', '.psd'):
                        raise ValueError(
                            'Expected a document identifier, not a file '
                            'identifier.  (Use corpus.documents() to get '
                            'a list of document identifiers.)')
                    else:
                        raise ValueError('Document identifier %s not found'
                                         % document)
        return ['%s.%s' % (d, subcorpus) for d in documents]

    # Delegate to one of our two sub-readers:
    def words(self, documents=None):
        """Delegate to ``words()`` of the pos sub-reader."""
        return self._pos_reader.words(self._getfileids(documents, 'pos'))

    def sents(self, documents=None):
        """Delegate to ``sents()`` of the pos sub-reader."""
        return self._pos_reader.sents(self._getfileids(documents, 'pos'))

    def paras(self, documents=None):
        """Delegate to ``paras()`` of the pos sub-reader."""
        return self._pos_reader.paras(self._getfileids(documents, 'pos'))

    def tagged_words(self, documents=None):
        """Delegate to ``tagged_words()`` of the pos sub-reader."""
        return self._pos_reader.tagged_words(self._getfileids(documents, 'pos'))

    def tagged_sents(self, documents=None):
        """Delegate to ``tagged_sents()`` of the pos sub-reader."""
        return self._pos_reader.tagged_sents(self._getfileids(documents, 'pos'))

    def tagged_paras(self, documents=None):
        """Delegate to ``tagged_paras()`` of the pos sub-reader."""
        return self._pos_reader.tagged_paras(self._getfileids(documents, 'pos'))

    def parsed_sents(self, documents=None):
        """Delegate to ``parsed_sents()`` of the psd sub-reader."""
        return self._psd_reader.parsed_sents(self._getfileids(documents, 'psd'))

 

 

class YCOEParseCorpusReader(BracketParseCorpusReader):
    """Bracket parse corpus reader specialised for YCOE ``psd`` files:
    (CODE ...) and (ID ...) nodes are removed before parsing."""

    def _parse(self, t):
        """
        Parse the bracketed string ``t`` after dropping its CODE and
        ID nodes.

        :return: None when only an empty pair of parentheses (plus
            optional whitespace) is left after the removal; otherwise
            whatever ``BracketParseCorpusReader._parse`` returns for
            the cleaned string.
        """
        cleaned = re.sub(r'(?u)\((CODE|ID)[^\)]*\)', '', t)
        # Nothing but "( )" remains: there is no tree to build.
        if re.match(r'\s*\(\s*\)\s*$', cleaned) is not None:
            return None
        return BracketParseCorpusReader._parse(self, cleaned)

 

class YCOETaggedCorpusReader(TaggedCorpusReader):
    """Tagged corpus reader specialised for YCOE ``pos`` files.

    Words and tags are separated by ``_``; sentences are split by a
    gap-matching ``RegexpTokenizer``.
    """

    def __init__(self, root, items, encoding=None):
        """
        :param root: Root directory handed to ``TaggedCorpusReader``.
        :param items: File identifiers (or pattern) handed to
            ``TaggedCorpusReader``.
        :param encoding: Accepted but not forwarded to
            ``TaggedCorpusReader.__init__``.
        """
        # A sentence gap is either whitespace that follows "/.", or a
        # run of non-whitespace ending in _CODE or _ID together with
        # the whitespace around it.
        sentence_gaps = r'(?u)(?<=/\.)\s+|\s*\S*_CODE\s*|\s*\S*_ID\s*'
        TaggedCorpusReader.__init__(
            self, root, items, sep='_',
            sent_tokenizer=RegexpTokenizer(sentence_gaps, gaps=True))

 

#: A mapping from each YCOE document identifier to the title of its text.
documents = {
    'coadrian.o34': 'Adrian and Ritheus',
    'coaelhom.o3': 'Ælfric, Supplemental Homilies',
    'coaelive.o3': 'Ælfric\'s Lives of Saints',
    'coalcuin': 'Alcuin De virtutibus et vitiis',
    'coalex.o23': 'Alexander\'s Letter to Aristotle',
    'coapollo.o3': 'Apollonius of Tyre',
    'coaugust': 'Augustine',
    'cobede.o2': 'Bede\'s History of the English Church',
    'cobenrul.o3': 'Benedictine Rule',
    'coblick.o23': 'Blickling Homilies',
    'coboeth.o2': 'Boethius\' Consolation of Philosophy',
    'cobyrhtf.o3': 'Byrhtferth\'s Manual',
    'cocanedgD': 'Canons of Edgar (D)',
    'cocanedgX': 'Canons of Edgar (X)',
    'cocathom1.o3': 'Ælfric\'s Catholic Homilies I',
    'cocathom2.o3': 'Ælfric\'s Catholic Homilies II',
    'cochad.o24': 'Saint Chad',
    'cochdrul': 'Chrodegang of Metz, Rule',
    'cochristoph': 'Saint Christopher',
    'cochronA.o23': 'Anglo-Saxon Chronicle A',
    'cochronC': 'Anglo-Saxon Chronicle C',
    'cochronD': 'Anglo-Saxon Chronicle D',
    'cochronE.o34': 'Anglo-Saxon Chronicle E',
    'cocura.o2': 'Cura Pastoralis',
    'cocuraC': 'Cura Pastoralis (Cotton)',
    'codicts.o34': 'Dicts of Cato',
    'codocu1.o1': 'Documents 1 (O1)',
    'codocu2.o12': 'Documents 2 (O1/O2)',
    'codocu2.o2': 'Documents 2 (O2)',
    'codocu3.o23': 'Documents 3 (O2/O3)',
    'codocu3.o3': 'Documents 3 (O3)',
    'codocu4.o24': 'Documents 4 (O2/O4)',
    'coeluc1': 'Honorius of Autun, Elucidarium 1',
    # Previously duplicated the 'coeluc1' title.
    'coeluc2': 'Honorius of Autun, Elucidarium 2',
    'coepigen.o3': 'Ælfric\'s Epilogue to Genesis',
    'coeuphr': 'Saint Euphrosyne',
    'coeust': 'Saint Eustace and his companions',
    'coexodusP': 'Exodus (P)',
    'cogenesiC': 'Genesis (C)',
    'cogregdC.o24': 'Gregory\'s Dialogues (C)',
    'cogregdH.o23': 'Gregory\'s Dialogues (H)',
    'coherbar': 'Pseudo-Apuleius, Herbarium',
    'coinspolD.o34': 'Wulfstan\'s Institute of Polity (D)',
    'coinspolX': 'Wulfstan\'s Institute of Polity (X)',
    'cojames': 'Saint James',
    'colacnu.o23': 'Lacnunga',
    'colaece.o2': 'Leechdoms',
    'colaw1cn.o3': 'Laws, Cnut I',
    'colaw2cn.o3': 'Laws, Cnut II',
    'colaw5atr.o3': 'Laws, Æthelred V',
    'colaw6atr.o3': 'Laws, Æthelred VI',
    'colawaf.o2': 'Laws, Alfred',
    'colawafint.o2': 'Alfred\'s Introduction to Laws',
    'colawger.o34': 'Laws, Gerefa',
    'colawine.ox2': 'Laws, Ine',
    'colawnorthu.o3': 'Northumbra Preosta Lagu',
    'colawwllad.o4': 'Laws, William I, Lad',
    'coleofri.o4': 'Leofric',
    'colsigef.o3': 'Ælfric\'s Letter to Sigefyrth',
    'colsigewB': 'Ælfric\'s Letter to Sigeweard (B)',
    'colsigewZ.o34': 'Ælfric\'s Letter to Sigeweard (Z)',
    'colwgeat': 'Ælfric\'s Letter to Wulfgeat',
    'colwsigeT': 'Ælfric\'s Letter to Wulfsige (T)',
    'colwsigeXa.o34': 'Ælfric\'s Letter to Wulfsige (Xa)',
    'colwstan1.o3': 'Ælfric\'s Letter to Wulfstan I',
    'colwstan2.o3': 'Ælfric\'s Letter to Wulfstan II',
    'comargaC.o34': 'Saint Margaret (C)',
    'comargaT': 'Saint Margaret (T)',
    'comart1': 'Martyrology, I',
    'comart2': 'Martyrology, II',
    'comart3.o23': 'Martyrology, III',
    'comarvel.o23': 'Marvels of the East',
    'comary': 'Mary of Egypt',
    'coneot': 'Saint Neot',
    'conicodA': 'Gospel of Nicodemus (A)',
    'conicodC': 'Gospel of Nicodemus (C)',
    'conicodD': 'Gospel of Nicodemus (D)',
    'conicodE': 'Gospel of Nicodemus (E)',
    'coorosiu.o2': 'Orosius',
    'cootest.o3': 'Heptateuch',
    'coprefcath1.o3': 'Ælfric\'s Preface to Catholic Homilies I',
    'coprefcath2.o3': 'Ælfric\'s Preface to Catholic Homilies II',
    'coprefcura.o2': 'Preface to the Cura Pastoralis',
    'coprefgen.o3': 'Ælfric\'s Preface to Genesis',
    'copreflives.o3': 'Ælfric\'s Preface to Lives of Saints',
    'coprefsolilo': 'Preface to Augustine\'s Soliloquies',
    'coquadru.o23': 'Pseudo-Apuleius, Medicina de quadrupedibus',
    'corood': 'History of the Holy Rood-Tree',
    'cosevensl': 'Seven Sleepers',
    'cosolilo': 'St. Augustine\'s Soliloquies',
    'cosolsat1.o4': 'Solomon and Saturn I',
    'cosolsat2': 'Solomon and Saturn II',
    'cotempo.o3': 'Ælfric\'s De Temporibus Anni',
    'coverhom': 'Vercelli Homilies',
    'coverhomE': 'Vercelli Homilies (E)',
    'coverhomL': 'Vercelli Homilies (L)',
    'covinceB': 'Saint Vincent (Bodley 343)',
    'covinsal': 'Vindicta Salvatoris',
    'cowsgosp.o3': 'West-Saxon Gospels',
    'cowulf.o34': 'Wulfstan\'s Homilies',
    }