Hot-keys on this page

r m x p   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

1

2

3

4

5

6

7

8

9

10

11

12

13

14

15

16

17

18

19

20

21

22

23

24

25

26

27

28

29

30

31

32

33

34

35

36

37

38

39

40

41

42

43

44

45

46

47

48

49

50

51

52

53

54

55

56

57

58

59

60

61

62

63

64

65

66

67

68

69

70

71

72

73

74

75

76

77

78

79

80

81

82

83

84

85

86

87

88

89

90

91

92

93

94

95

96

97

98

99

100

101

102

103

104

105

106

107

108

109

110

111

112

113

114

115

116

117

118

119

120

121

122

123

124

125

126

127

128

129

130

131

132

133

134

135

136

137

138

139

140

141

142

143

144

145

146

147

148

149

150

151

152

153

154

155

156

157

158

159

160

161

162

163

164

165

166

167

168

169

170

171

172

173

174

175

176

177

178

179

180

181

182

183

184

185

186

187

188

189

190

191

192

193

194

195

196

197

198

199

200

201

202

203

204

205

206

207

208

209

210

211

212

213

214

215

216

217

218

219

220

221

222

223

224

225

226

227

228

229

230

231

232

233

234

235

236

237

238

239

240

241

242

243

244

245

246

247

248

249

250

251

252

253

254

255

256

257

258

259

260

261

262

263

264

265

266

267

268

269

270

271

272

273

274

275

276

277

278

279

280

281

282

283

284

285

286

287

288

289

290

291

292

293

294

295

296

297

298

299

300

301

302

303

304

305

306

307

308

309

310

311

312

313

314

315

316

317

318

319

320

321

322

323

324

325

326

327

328

329

330

331

332

333

334

335

336

337

338

339

340

341

342

343

344

345

346

347

348

349

350

351

352

353

354

355

356

357

358

359

360

361

362

363

364

365

366

367

368

369

370

371

372

373

374

375

376

377

378

379

380

381

382

383

384

385

386

387

388

389

390

# Natural Language Toolkit: Verbnet Corpus Reader 

# 

# Copyright (C) 2001-2012 NLTK Project 

# Author: Edward Loper <edloper@gradient.cis.upenn.edu> 

# URL: <http://www.nltk.org/> 

# For license information, see LICENSE.TXT 

 

import re 

import textwrap 

from collections import defaultdict 

 

from nltk import compat 

from .util import * 

from .api import * 

from .xmldocs import * 

 

class VerbnetCorpusReader(XMLCorpusReader): 

 

    # No unicode encoding param, since the data files are all XML. 

    def __init__(self, root, fileids, wrap_etree=False): 

        XMLCorpusReader.__init__(self, root, fileids, wrap_etree) 

 

        self._lemma_to_class = defaultdict(list) 

        """A dictionary mapping from verb lemma strings to lists of 

        verbnet class identifiers.""" 

 

        self._wordnet_to_class = defaultdict(list) 

        """A dictionary mapping from wordnet identifier strings to 

        lists of verbnet class identifiers.""" 

 

        self._class_to_fileid = {} 

        """A dictionary mapping from class identifiers to 

        corresponding file identifiers.  The keys of this dictionary 

        provide a complete list of all classes and subclasses.""" 

 

        self._shortid_to_longid = {} 

 

        # Initialize the dictionaries.  Use the quick (regexp-based) 

        # method instead of the slow (xml-based) method, because it 

        # runs 2-30 times faster. 

        self._quick_index() 

 

    # Long class identifier, e.g. 'confess-37.10'.  Group 1 is the text
    # before the '-' separator (it may contain neither '-' nor '.');
    # group 2 is the trailing run of digits, '+', '.' and '-' characters.
    # ``shortid()`` returns group 2 as the short identifier.
    _LONGID_RE = re.compile(r'([^\-\.]*)-([\d+.\-]+)$')
    """Regular expression that matches (and decomposes) longids"""

    # Short class identifier, e.g. '37.10': when applied with ``match``
    # it accepts a string built only from digits, '+', '.' and '-'.
    _SHORTID_RE = re.compile(r'[\d+.\-]+$')
    """Regular expression that matches shortids"""

    # Two alternatives.  A MEMBER tag fills group 1 (the name attribute,
    # minus an optional leading '?') and group 2 (the wn attribute); a
    # VNSUBCLASS tag fills group 3 (its ID attribute).
    # NOTE: the string below mentions ``_index()``, but within this class
    # the pattern is consumed by ``_quick_index()``.
    _INDEX_RE = re.compile(r'<MEMBER name="\??([^"]+)" wn="([^"]*)"[^>]+>|'
                           r'<VNSUBCLASS ID="([^"]+)"/?>')
    """Regular expression used by ``_index()`` to quickly scan the corpus 

       for basic information."""

 

    def lemmas(self, classid=None): 

        """ 

        Return a list of all verb lemmas that appear in any class, or 

        in the ``classid`` if specified. 

        """ 

        if classid is None: 

            return sorted(self._lemma_to_class.keys()) 

        else: 

            # [xx] should this include subclass members? 

            vnclass = self.vnclass(classid) 

            return [member.get('name') for member in 

                    vnclass.findall('MEMBERS/MEMBER')] 

 

    def wordnetids(self, classid=None): 

        """ 

        Return a list of all wordnet identifiers that appear in any 

        class, or in ``classid`` if specified. 

        """ 

        if classid is None: 

            return sorted(self._wordnet_to_class.keys()) 

        else: 

            # [xx] should this include subclass members? 

            vnclass = self.vnclass(classid) 

            return sum([member.get('wn','').split() for member in 

                        vnclass.findall('MEMBERS/MEMBER')], []) 

 

    def classids(self, lemma=None, wordnetid=None, fileid=None, classid=None): 

        """ 

        Return a list of the verbnet class identifiers.  If a file 

        identifier is specified, then return only the verbnet class 

        identifiers for classes (and subclasses) defined by that file. 

        If a lemma is specified, then return only verbnet class 

        identifiers for classes that contain that lemma as a member. 

        If a wordnetid is specified, then return only identifiers for 

        classes that contain that wordnetid as a member.  If a classid 

        is specified, then return only identifiers for subclasses of 

        the specified verbnet class. 

        """ 

        if len([x for x in [lemma, wordnetid, fileid, classid] 

                if x is not None]) > 1: 

            raise ValueError('Specify at most one of: fileid, wordnetid, ' 

                             'fileid, classid') 

        if fileid is not None: 

            return [c for (c,f) in self._class_to_fileid.items() 

                    if f == fileid] 

        elif lemma is not None: 

            return self._lemma_to_class[lemma] 

        elif wordnetid is not None: 

            return self._wordnet_to_class[wordnetid] 

        elif classid is not None: 

            xmltree = self.vnclass(classid) 

            return [subclass.get('ID') for subclass in 

                    xmltree.findall('SUBCLASSES/VNSUBCLASS')] 

        else: 

            return sorted(self._class_to_fileid.keys()) 

 

    def vnclass(self, fileid_or_classid): 

        """ 

        Return an ElementTree containing the xml for the specified 

        verbnet class. 

 

        :param fileid_or_classid: An identifier specifying which class 

            should be returned.  Can be a file identifier (such as 

            ``'put-9.1.xml'``), or a verbnet class identifier (such as 

            ``'put-9.1'``) or a short verbnet class identifier (such as 

            ``'9.1'``). 

        """ 

        # File identifier: just return the xml. 

        if fileid_or_classid in self._fileids: 

            return self.xml(fileid_or_classid) 

 

        # Class identifier: get the xml, and find the right elt. 

        classid = self.longid(fileid_or_classid) 

        if classid in self._class_to_fileid: 

            fileid = self._class_to_fileid[self.longid(classid)] 

            tree = self.xml(fileid) 

            if classid == tree.get('ID'): 

                return tree 

            else: 

                for subclass in tree.findall('.//VNSUBCLASS'): 

                    if classid == subclass.get('ID'): 

                        return subclass 

                else: 

                    assert False # we saw it during _index()! 

 

        else: 

            raise ValueError('Unknown identifier %s' % fileid_or_classid) 

 

    def fileids(self, vnclass_ids=None): 

        """ 

        Return a list of fileids that make up this corpus.  If 

        ``vnclass_ids`` is specified, then return the fileids that make 

        up the specified verbnet class(es). 

        """ 

        if vnclass_ids is None: 

            return self._fileids 

        elif isinstance(vnclass_ids, compat.string_types): 

            return [self._class_to_fileid[self.longid(vnclass_ids)]] 

        else: 

            return [self._class_to_fileid[self.longid(vnclass_id)] 

                    for vnclass_id in vnclass_ids] 

 

 

    ###################################################################### 

    #{ Index Initialization 

    ###################################################################### 

 

    def _index(self): 

        """ 

        Initialize the indexes ``_lemma_to_class``, 

        ``_wordnet_to_class``, and ``_class_to_fileid`` by scanning 

        through the corpus fileids.  This is fast with cElementTree 

        (<0.1 secs), but quite slow (>10 secs) with the python 

        implementation of ElementTree. 

        """ 

        for fileid in self._fileids: 

            self._index_helper(self.xml(fileid), fileid) 

 

    def _index_helper(self, xmltree, fileid): 

        """Helper for ``_index()``""" 

        vnclass = xmltree.get('ID') 

        self._class_to_fileid[vnclass] = fileid 

        self._shortid_to_longid[self.shortid(vnclass)] = vnclass 

        for member in xmltree.findall('MEMBERS/MEMBER'): 

            self._lemma_to_class[member.get('name')].append(vnclass) 

            for wn in member.get('wn', '').split(): 

                self._wordnet_to_class[wn].append(vnclass) 

        for subclass in xmltree.findall('SUBCLASSES/VNSUBCLASS'): 

            self._index_helper(subclass, fileid) 

 

    def _quick_index(self): 

        """ 

        Initialize the indexes ``_lemma_to_class``, 

        ``_wordnet_to_class``, and ``_class_to_fileid`` by scanning 

        through the corpus fileids.  This doesn't do proper xml parsing, 

        but is good enough to find everything in the standard verbnet 

        corpus -- and it runs about 30 times faster than xml parsing 

        (with the python ElementTree; only 2-3 times faster with 

        cElementTree). 

        """ 

        # nb: if we got rid of wordnet_to_class, this would run 2-3 

        # times faster. 

        for fileid in self._fileids: 

            vnclass = fileid[:-4] # strip the '.xml' 

            self._class_to_fileid[vnclass] = fileid 

            self._shortid_to_longid[self.shortid(vnclass)] = vnclass 

            for m in self._INDEX_RE.finditer(self.open(fileid).read()): 

                groups = m.groups() 

                if groups[0] is not None: 

                    self._lemma_to_class[groups[0]].append(vnclass) 

                    for wn in groups[1].split(): 

                        self._wordnet_to_class[wn].append(vnclass) 

                elif groups[2] is not None: 

                    self._class_to_fileid[groups[2]] = fileid 

                    vnclass = groups[2] # for <MEMBER> elts. 

                    self._shortid_to_longid[self.shortid(vnclass)] = vnclass 

                else: 

                    assert False, 'unexpected match condition' 

 

    ###################################################################### 

    #{ Identifier conversion 

    ###################################################################### 

 

    def longid(self, shortid): 

        """Given a short verbnet class identifier (eg '37.10'), map it 

        to a long id (eg 'confess-37.10').  If ``shortid`` is already a 

        long id, then return it as-is""" 

        if self._LONGID_RE.match(shortid): 

            return shortid # it's already a longid. 

        elif not self._SHORTID_RE.match(shortid): 

            raise ValueError('vnclass identifier %r not found' % shortid) 

        try: 

            return self._shortid_to_longid[shortid] 

        except KeyError: 

            raise ValueError('vnclass identifier %r not found' % shortid) 

 

    def shortid(self, longid): 

        """Given a long verbnet class identifier (eg 'confess-37.10'), 

        map it to a short id (eg '37.10').  If ``longid`` is already a 

        short id, then return it as-is.""" 

        if self._SHORTID_RE.match(longid): 

            return longid # it's already a shortid. 

        m = self._LONGID_RE.match(longid) 

        if m: 

            return m.group(2) 

        else: 

            raise ValueError('vnclass identifier %r not found' % longid) 

 

    ###################################################################### 

    #{ Pretty Printing 

    ###################################################################### 

 

    def pprint(self, vnclass): 

        """ 

        Return a string containing a pretty-printed representation of 

        the given verbnet class. 

 

        :param vnclass: A verbnet class identifier; or an ElementTree 

        containing the xml contents of a verbnet class. 

        """ 

        if isinstance(vnclass, compat.string_types): 

            vnclass = self.vnclass(vnclass) 

 

        s = vnclass.get('ID') + '\n' 

        s += self.pprint_subclasses(vnclass, indent='  ') + '\n' 

        s += self.pprint_members(vnclass, indent='  ') + '\n' 

        s += '  Thematic roles:\n' 

        s += self.pprint_themroles(vnclass, indent='    ') + '\n' 

        s += '  Frames:\n' 

        s += '\n'.join(self.pprint_frame(vnframe, indent='    ') 

                       for vnframe in vnclass.findall('FRAMES/FRAME')) 

        return s 

 

    def pprint_subclasses(self, vnclass, indent=''): 

        """ 

        Return a string containing a pretty-printed representation of 

        the given verbnet class's subclasses. 

 

        :param vnclass: A verbnet class identifier; or an ElementTree 

            containing the xml contents of a verbnet class. 

        """ 

        if isinstance(vnclass, compat.string_types): 

            vnclass = self.vnclass(vnclass) 

 

        subclasses = [subclass.get('ID') for subclass in 

                      vnclass.findall('SUBCLASSES/VNSUBCLASS')] 

        if not subclasses: subclasses = ['(none)'] 

        s = 'Subclasses: ' + ' '.join(subclasses) 

        return textwrap.fill(s, 70, initial_indent=indent, 

                             subsequent_indent=indent+'  ') 

 

    def pprint_members(self, vnclass, indent=''): 

        """ 

        Return a string containing a pretty-printed representation of 

        the given verbnet class's member verbs. 

 

        :param vnclass: A verbnet class identifier; or an ElementTree 

            containing the xml contents of a verbnet class. 

        """ 

        if isinstance(vnclass, compat.string_types): 

            vnclass = self.vnclass(vnclass) 

 

        members = [member.get('name') for member in 

                   vnclass.findall('MEMBERS/MEMBER')] 

        if not members: members = ['(none)'] 

        s = 'Members: ' + ' '.join(members) 

        return textwrap.fill(s, 70, initial_indent=indent, 

                             subsequent_indent=indent+'  ') 

 

    def pprint_themroles(self, vnclass, indent=''): 

        """ 

        Return a string containing a pretty-printed representation of 

        the given verbnet class's thematic roles. 

 

        :param vnclass: A verbnet class identifier; or an ElementTree 

            containing the xml contents of a verbnet class. 

        """ 

        if isinstance(vnclass, compat.string_types): 

            vnclass = self.vnclass(vnclass) 

 

        pieces = [] 

        for themrole in vnclass.findall('THEMROLES/THEMROLE'): 

            piece = indent + '* ' + themrole.get('type') 

            modifiers = ['%(Value)s%(type)s' % restr.attrib 

                         for restr in themrole.findall('SELRESTRS/SELRESTR')] 

            if modifiers: 

                piece += '[%s]' % ' '.join(modifiers) 

            pieces.append(piece) 

 

        return '\n'.join(pieces) 

 

    def pprint_frame(self, vnframe, indent=''): 

        """ 

        Return a string containing a pretty-printed representation of 

        the given verbnet frame. 

 

        :param vnframe: An ElementTree containing the xml contents of 

            a verbnet frame. 

        """ 

        s = self.pprint_description(vnframe, indent) + '\n' 

        s += self.pprint_syntax(vnframe, indent+'  Syntax: ') + '\n' 

        s += indent + '  Semantics:\n' 

        s += self.pprint_semantics(vnframe, indent+'    ') 

        return s 

 

    def pprint_description(self, vnframe, indent=''): 

        """ 

        Return a string containing a pretty-printed representation of 

        the given verbnet frame description. 

 

        :param vnframe: An ElementTree containing the xml contents of 

            a verbnet frame. 

        """ 

        descr = vnframe.find('DESCRIPTION') 

        s = indent + descr.attrib['primary'] 

        if descr.get('secondary', ''): 

            s += ' (%s)' % descr.get('secondary') 

        return s 

 

    def pprint_syntax(self, vnframe, indent=''): 

        """ 

        Return a string containing a pretty-printed representation of 

        the given verbnet frame syntax. 

 

        :param vnframe: An ElementTree containing the xml contents of 

            a verbnet frame. 

        """ 

        pieces = [] 

        for elt in vnframe.find('SYNTAX'): 

            piece = elt.tag 

            modifiers = [] 

            if 'value' in elt.attrib: 

                modifiers.append(elt.get('value')) 

            modifiers += ['%(Value)s%(type)s' % restr.attrib 

                          for restr in (elt.findall('SELRESTRS/SELRESTR') + 

                                        elt.findall('SYNRESTRS/SYNRESTR'))] 

            if modifiers: 

                piece += '[%s]' % ' '.join(modifiers) 

            pieces.append(piece) 

 

        return indent + ' '.join(pieces) 

 

    def pprint_semantics(self, vnframe, indent=''): 

        """ 

        Return a string containing a pretty-printed representation of 

        the given verbnet frame semantics. 

 

        :param vnframe: An ElementTree containing the xml contents of 

            a verbnet frame. 

        """ 

        pieces = [] 

        for pred in vnframe.findall('SEMANTICS/PRED'): 

            args = [arg.get('value') for arg in pred.findall('ARGS/ARG')] 

            pieces.append('%s(%s)' % (pred.get('value'), ', '.join(args))) 

        return '\n'.join(['%s* %s' % (indent, piece) for piece in pieces])