Hot-keys on this page

r m x p   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

1

2

3

4

5

6

7

8

9

10

11

12

13

14

15

16

17

18

19

20

21

22

23

24

25

26

27

28

29

30

31

32

33

34

35

36

37

38

39

40

41

42

43

44

45

46

47

48

49

50

51

52

53

54

55

56

57

58

59

60

61

62

63

64

65

66

67

68

69

70

71

72

73

74

75

76

77

78

79

80

81

82

83

84

85

86

87

88

89

90

91

92

93

94

95

96

97

98

99

100

101

102

103

104

105

106

107

108

109

110

111

112

113

114

115

116

117

118

119

120

121

122

123

124

125

126

127

128

129

130

131

132

133

134

135

136

137

138

139

140

141

142

143

144

145

146

147

148

149

150

151

152

153

154

155

156

157

158

159

160

161

162

163

164

165

166

167

168

169

170

171

172

173

174

175

176

177

178

179

180

181

182

183

184

185

186

187

188

189

190

191

192

193

194

195

196

197

198

199

200

201

202

203

204

205

206

207

208

209

210

211

212

213

214

215

216

217

218

219

220

221

222

223

224

225

226

227

228

229

230

231

232

233

234

235

236

237

238

239

240

241

242

243

244

245

246

247

248

249

250

251

252

253

254

255

256

257

258

259

260

261

262

263

264

265

266

267

268

269

270

271

272

273

274

275

276

277

278

279

280

281

282

283

284

285

286

287

288

289

290

291

292

293

294

295

296

297

298

299

300

301

302

303

304

305

306

307

308

309

310

311

312

313

314

315

316

317

318

319

320

321

322

323

324

325

326

327

328

329

330

331

332

333

334

335

336

337

338

339

340

341

342

343

344

345

346

347

348

349

350

351

352

353

354

355

356

357

358

359

360

361

362

363

364

365

366

367

368

369

370

371

372

373

374

375

376

377

378

379

380

381

382

383

384

385

386

387

388

389

390

391

392

393

394

395

396

397

398

399

400

401

402

403

404

405

406

407

408

409

410

411

412

413

414

415

416

417

418

# Natural Language Toolkit: PropBank Corpus Reader 

# 

# Copyright (C) 2001-2012 NLTK Project 

# Author: Edward Loper <edloper@gradient.cis.upenn.edu> 

# URL: <http://www.nltk.org/> 

# For license information, see LICENSE.TXT 

 

import re 

import codecs 

from xml.etree import ElementTree 

 

from nltk import compat 

from nltk.tree import Tree 

 

from .util import * 

from .api import * 

 

class PropbankCorpusReader(CorpusReader):
    """
    Corpus reader for the propbank corpus, which augments the Penn
    Treebank with information about the predicate argument structure
    of every verb instance.  The corpus consists of two parts: the
    predicate-argument annotations themselves, and a set of "frameset
    files" which define the argument labels used by the annotations,
    on a per-verb basis.  Each "frameset file" contains one or more
    predicates, such as ``'turn'`` or ``'turn_on'``, each of which is
    divided into coarse-grained word senses called "rolesets".  For
    each "roleset", the frameset file provides descriptions of the
    argument roles, along with examples.
    """
    def __init__(self, root, propfile, framefiles='',
                 verbsfile=None, parse_fileid_xform=None,
                 parse_corpus=None, encoding=None):
        """
        :param root: The root directory for this corpus.
        :param propfile: The name of the file containing the predicate-
            argument annotations (relative to ``root``).
        :param framefiles: A list or regexp specifying the frameset
            fileids for this corpus.
        :param verbsfile: The name of the file that ``verbs()`` reads,
            one entry per line; may be None, in which case it is not
            registered as a fileid of this corpus.
        :param parse_fileid_xform: A transform that should be applied
            to the fileids in this corpus.  This should be a function
            of one argument (a fileid) that returns a string (the new
            fileid).
        :param parse_corpus: The corpus containing the parse trees
            corresponding to this corpus.  These parse trees are
            necessary to resolve the tree pointers used by propbank.
        :param encoding: Passed through unchanged to
            ``CorpusReader.__init__``.
        """
        # If framefiles is specified as a regexp, expand it.
        if isinstance(framefiles, compat.string_types):
            framefiles = find_corpus_fileids(root, framefiles)
        framefiles = list(framefiles)

        # verbsfile is optional: only register it when it was given, so
        # that the fileid list never contains None.
        fileids = [propfile]
        if verbsfile is not None:
            fileids.append(verbsfile)

        # Initialize the corpus reader.
        CorpusReader.__init__(self, root, fileids + framefiles, encoding)

        # Record our frame fileids & prop file.
        self._propfile = propfile
        self._framefiles = framefiles
        self._verbsfile = verbsfile
        self._parse_fileid_xform = parse_fileid_xform
        self._parse_corpus = parse_corpus

    def raw(self, fileids=None):
        """
        :param fileids: A single fileid, a list of fileids, or None to
            use every fileid of this corpus.
        :return: the text contents of the given fileids, as a single string.
        """
        if fileids is None:
            fileids = self._fileids
        elif isinstance(fileids, compat.string_types):
            fileids = [fileids]

        contents = []
        for fileid in fileids:
            stream = self.open(fileid)
            # Close each stream as soon as it has been read instead of
            # leaving it to the garbage collector.
            try:
                contents.append(stream.read())
            finally:
                stream.close()
        return concat(contents)

    def instances(self):
        """
        :return: a corpus view that acts as a list of
        ``PropbankInstance`` objects, one for each verb in the corpus.
        """
        return StreamBackedCorpusView(self.abspath(self._propfile),
                                      self._read_instance_block,
                                      encoding=self.encoding(self._propfile))

    def lines(self):
        """
        :return: a corpus view that acts as a list of strings, one for
        each line in the predicate-argument annotation file.
        """
        return StreamBackedCorpusView(self.abspath(self._propfile),
                                      read_line_block,
                                      encoding=self.encoding(self._propfile))

    def roleset(self, roleset_id):
        """
        :param roleset_id: A roleset identifier; the text before the
            first ``'.'`` names the frameset file
            (``frames/<lemma>.xml``) that is searched.
        :return: the xml description for the given roleset.
        :raise ValueError: if the frameset file is not one of this
            corpus's frame files, or if it contains no roleset with
            the given id.
        """
        lemma = roleset_id.split('.')[0]
        framefile = 'frames/%s.xml' % lemma
        if framefile not in self._framefiles:
            raise ValueError('Frameset file for %s not found' %
                             roleset_id)

        # n.b.: The encoding for XML fileids is specified by the file
        # itself; so we ignore self._encoding here.
        stream = self.abspath(framefile).open()
        # ElementTree.parse() does not close a stream it is handed, so
        # close it here once the document has been read.
        try:
            etree = ElementTree.parse(stream).getroot()
        finally:
            stream.close()

        for roleset in etree.findall('predicate/roleset'):
            if roleset.attrib['id'] == roleset_id:
                return roleset
        raise ValueError('Roleset %s not found in %s' %
                         (roleset_id, framefile))

    def verbs(self):
        """
        :return: a corpus view that acts as a list of all verb lemmas
        in this corpus (from the verbs.txt file).
        """
        return StreamBackedCorpusView(self.abspath(self._verbsfile),
                                      read_line_block,
                                      encoding=self.encoding(self._verbsfile))

    def _read_instance_block(self, stream):
        """
        Block reader used by ``instances()``: read up to 100 lines from
        ``stream`` and parse every non-blank one.

        :return: a list of ``PropbankInstance`` objects.  Blank lines
            are skipped but still count towards the 100 reads, so the
            list may be shorter than 100 (or empty).
        """
        block = []

        # Read 100 at a time.
        for i in range(100):
            line = stream.readline().strip()
            if line:
                block.append(PropbankInstance.parse(
                    line, self._parse_fileid_xform,
                    self._parse_corpus))

        return block

 

###################################################################### 

#{ Propbank Instance & related datatypes 

###################################################################### 

 

class PropbankInstance(object):
    """
    One annotated predicate from the propbank annotation file,
    together with the locations of its arguments.

    :ivar fileid: The name of the file containing the parse tree for
        this instance's sentence.
    :ivar sentnum: The sentence number of this sentence within
        ``fileid``.  Indexing starts from zero.
    :ivar wordnum: The word number of this instance's predicate within
        its containing sentence.  Word numbers are indexed starting
        from zero, and include traces and other empty parse elements.
    :ivar tagger: An identifier for the tagger who tagged this
        instance; or ``'gold'`` if this is an adjudicated instance.
    :ivar roleset: The name of the roleset used by this instance's
        predicate.  Use ``propbank.roleset()
        <PropbankCorpusReader.roleset>`` to look up information about
        the roleset.
    :ivar inflection: A ``PropbankInflection`` object describing the
        inflection of this instance's predicate.
    :ivar predicate: A ``PropbankTreePointer`` indicating the position
        of this instance's predicate within its containing sentence.
    :ivar arguments: A tuple of pairs (argloc, argid), specifying the
        location and identifier for each of the predicate's arguments
        in the containing sentence.  Argument identifiers are strings
        such as ``'ARG0'`` or ``'ARGM-TMP'``.  The predicate itself is
        *not* included.
    :ivar parse_corpus: A corpus reader for the parse trees
        corresponding to the instances in this propbank corpus.
    """

    def __init__(self, fileid, sentnum, wordnum, tagger, roleset,
                 inflection, predicate, arguments, parse_corpus=None):
        self.fileid = fileid
        self.sentnum = sentnum
        self.wordnum = wordnum
        self.tagger = tagger
        self.roleset = roleset
        self.inflection = inflection
        self.predicate = predicate
        # Stored as a tuple whatever iterable the caller handed in.
        self.arguments = tuple(arguments)
        self.parse_corpus = parse_corpus

    def __repr__(self):
        location = (self.fileid, self.sentnum, self.wordnum)
        return '<PropbankInstance: %s, sent %s, word %s>' % location

    def __str__(self):
        header = '%s %s %s %s %s %s' % (self.fileid, self.sentnum,
                                        self.wordnum, self.tagger,
                                        self.roleset, self.inflection)
        # The predicate is written out as one more pointer, labelled
        # 'rel', sorted in among the arguments.
        located = sorted(self.arguments + ((self.predicate, 'rel'),))
        return header + ''.join(' %s-%s' % (argloc, argid)
                                for (argloc, argid) in located)

    def _get_tree(self):
        corpus = self.parse_corpus
        if corpus is None or self.fileid not in corpus.fileids():
            return None
        return corpus.parsed_sents(self.fileid)[self.sentnum]
    tree = property(_get_tree, doc="""
        The parse tree corresponding to this instance, or None if
        the corresponding tree is not available.""")

    @staticmethod
    def parse(s, parse_fileid_xform=None, parse_corpus=None):
        """
        Build a ``PropbankInstance`` from one line of the annotation
        file.

        :param s: The whitespace-separated annotation line: six leading
            fields (fileid, sentence number, word number, tagger,
            roleset, inflection) followed by ``pointer-label`` items,
            exactly one of which must end in ``-rel``.
        :param parse_fileid_xform: Optional function applied to the
            fileid field before it is stored.
        :param parse_corpus: Stored on the new instance unchanged.
        :raise ValueError: if the line has fewer than seven fields or
            does not contain exactly one ``-rel`` item.
        """
        fields = s.split()
        if len(fields) < 7:
            raise ValueError('Badly formatted propbank line: %r' % s)

        # The first six fields are fixed; everything after them is
        # either the predicate ('...-rel') or an argument.
        fileid, sentnum, wordnum, tagger, roleset, inflection = fields[:6]
        rel, args = [], []
        for field in fields[6:]:
            if field.endswith('-rel'):
                rel.append(field)
            else:
                args.append(field)
        if len(rel) != 1:
            raise ValueError('Badly formatted propbank line: %r' % s)

        # Apply the fileid transform, if any.
        if parse_fileid_xform is not None:
            fileid = parse_fileid_xform(fileid)

        # Sentence & word numbers become ints.
        sentnum = int(sentnum)
        wordnum = int(wordnum)

        inflection = PropbankInflection.parse(inflection)

        # Strip the trailing '-rel' to get the predicate's pointer.
        predicate = PropbankTreePointer.parse(rel[0][:-4])

        # Each argument is 'pointer-label'; split on the first dash only,
        # since labels such as 'ARGM-TMP' contain dashes themselves.
        arguments = [(PropbankTreePointer.parse(argloc), argid)
                     for argloc, argid in (arg.split('-', 1) for arg in args)]

        return PropbankInstance(fileid, sentnum, wordnum, tagger,
                                roleset, inflection, predicate,
                                arguments, parse_corpus)

 

class PropbankPointer(object):
    """
    A pointer used by propbank to identify one or more constituents in
    a parse tree.  ``PropbankPointer`` is an abstract base class with
    three concrete subclasses:

      - ``PropbankTreePointer`` is used to point to single constituents.
      - ``PropbankSplitTreePointer`` is used to point to 'split'
        constituents, which consist of a sequence of two or more
        ``PropbankTreePointer`` pointers.
      - ``PropbankChainTreePointer`` is used to point to entire trace
        chains in a tree.  It consists of a sequence of pieces, which
        can be ``PropbankTreePointer`` or ``PropbankSplitTreePointer`` pointers.
    """
    def __init__(self):
        """
        :raise NotImplementedError: if ``PropbankPointer`` itself (not
            a subclass) is being instantiated.
        """
        # Only the abstract base is refused; subclasses that chain up
        # to this constructor pass straight through.
        if self.__class__ == PropbankPointer:
            raise NotImplementedError()

 

class PropbankChainTreePointer(PropbankPointer):
    """
    A pointer to a trace chain, written as its pieces joined by ``*``.

    :ivar pieces: A list of the pieces that make up this chain.
        Elements may be either ``PropbankSplitTreePointer`` or
        ``PropbankTreePointer`` pointers.
    """

    def __init__(self, pieces):
        self.pieces = pieces

    def __str__(self):
        return '*'.join(['%s' % piece for piece in self.pieces])

    def __repr__(self):
        return '<PropbankChainTreePointer: %s>' % (self,)

    def select(self, tree):
        """
        Return a ``*CHAIN*`` tree whose children are what each piece
        selects from ``tree``.

        :raise ValueError: if ``tree`` is None.
        """
        if tree is None:
            raise ValueError('Parse tree not avaialable')
        selected = []
        for piece in self.pieces:
            selected.append(piece.select(tree))
        return Tree('*CHAIN*', selected)

 

class PropbankSplitTreePointer(PropbankPointer):
    """
    A pointer to a 'split' constituent, written as its pieces joined
    by ``,``.

    :ivar pieces: A list of the pieces that make up this split
        constituent.  Elements are all ``PropbankTreePointer``
        pointers.
    """

    def __init__(self, pieces):
        self.pieces = pieces

    def __str__(self):
        return ','.join(['%s' % piece for piece in self.pieces])

    def __repr__(self):
        return '<PropbankSplitTreePointer: %s>' % (self,)

    def select(self, tree):
        """
        Return a ``*SPLIT*`` tree whose children are what each piece
        selects from ``tree``.

        :raise ValueError: if ``tree`` is None.
        """
        if tree is None:
            raise ValueError('Parse tree not avaialable')
        selected = []
        for piece in self.pieces:
            selected.append(piece.select(tree))
        return Tree('*SPLIT*', selected)

 

class PropbankTreePointer(PropbankPointer):
    """
    A pointer to a single constituent, written ``wordnum:height``.

    ``parse()`` additionally understands the two compound notations
    and returns the matching pointer class for them:

      - ``wordnum:height*wordnum:height*...`` (a chain), and
      - ``wordnum:height,wordnum:height,...`` (a split constituent).

    Tree pointers order by ``wordnum`` first; for the same word, the
    pointer with the greater ``height`` sorts first.  When compared
    against a chain or split pointer, that pointer's first piece is
    used.  Chain and split pointers define no ordering of their own.
    """
    def __init__(self, wordnum, height):
        self.wordnum = wordnum
        self.height = height

    @staticmethod
    def parse(s):
        """
        Parse a pointer string.

        :return: a ``PropbankChainTreePointer`` if ``s`` contains
            ``*``, else a ``PropbankSplitTreePointer`` if it contains
            ``,``, else a ``PropbankTreePointer``.
        :raise ValueError: if a simple pointer is not of the form
            ``int:int``.
        """
        # Deal with chains (xx*yy*zz)
        pieces = s.split('*')
        if len(pieces) > 1:
            return PropbankChainTreePointer([PropbankTreePointer.parse(elt)
                                              for elt in pieces])

        # Deal with split args (xx,yy,zz)
        pieces = s.split(',')
        if len(pieces) > 1:
            return PropbankSplitTreePointer([PropbankTreePointer.parse(elt)
                                             for elt in pieces])

        # Deal with normal pointers.
        pieces = s.split(':')
        if len(pieces) != 2: raise ValueError('bad propbank pointer %r' % s)
        return PropbankTreePointer(int(pieces[0]), int(pieces[1]))

    def __str__(self):
        return '%s:%s' % (self.wordnum, self.height)

    def __repr__(self):
        return 'PropbankTreePointer(%d, %d)' % (self.wordnum, self.height)

    def _ordering_keys(self, other):
        """
        Return a pair ``(own_key, other_key)`` whose comparison gives
        the ordering of ``self`` relative to ``other``.
        """
        # Compound pointers are represented by their first piece.
        while isinstance(other, (PropbankChainTreePointer,
                                 PropbankSplitTreePointer)):
            other = other.pieces[0]

        # Anything else gets an arbitrary ordering by identity.
        if not isinstance(other, PropbankTreePointer):
            return id(self), id(other)

        return ((self.wordnum, -self.height),
                (other.wordnum, -other.height))

    # Rich comparisons: Python 3 ignores __cmp__ and has no cmp()
    # builtin, so sorting pointers needs these to be spelled out.
    def __eq__(self, other):
        mine, theirs = self._ordering_keys(other)
        return mine == theirs

    def __ne__(self, other):
        mine, theirs = self._ordering_keys(other)
        return mine != theirs

    def __lt__(self, other):
        mine, theirs = self._ordering_keys(other)
        return mine < theirs

    def __le__(self, other):
        mine, theirs = self._ordering_keys(other)
        return mine <= theirs

    def __gt__(self, other):
        mine, theirs = self._ordering_keys(other)
        return mine > theirs

    def __ge__(self, other):
        mine, theirs = self._ordering_keys(other)
        return mine >= theirs

    def __hash__(self):
        # Defining __eq__ would otherwise make instances unhashable on
        # Python 3; equal tree pointers hash alike.
        return hash((self.wordnum, self.height))

    def __cmp__(self, other):
        # Kept for callers that invoke __cmp__ directly; written
        # without the cmp() builtin so it also runs on Python 3.
        mine, theirs = self._ordering_keys(other)
        return (mine > theirs) - (mine < theirs)

    def select(self, tree):
        """
        Return the subtree of ``tree`` that this pointer refers to.

        :raise ValueError: if ``tree`` is None.
        """
        if tree is None: raise ValueError('Parse tree not avaialable')
        return tree[self.treepos(tree)]

    def treepos(self, tree):
        """
        Convert this pointer to a standard 'tree position' pointer,
        given that it points to the given tree.

        :return: a tuple of child indices: the path to leaf number
            ``wordnum`` with its last ``height + 1`` entries removed.
        :raise ValueError: if ``tree`` is None.
        """
        if tree is None: raise ValueError('Parse tree not avaialable')
        # Depth-first walk: ``stack`` holds the nodes on the current
        # path, ``treepos`` the child index taken at each tree node.
        stack = [tree]
        treepos = []

        wordnum = 0
        while True:
            # tree node:
            if isinstance(stack[-1], Tree):
                # Select the next child.
                if len(treepos) < len(stack):
                    treepos.append(0)
                else:
                    treepos[-1] += 1
                # Update the stack.
                if treepos[-1] < len(stack[-1]):
                    stack.append(stack[-1][treepos[-1]])
                else:
                    # End of node's child list: pop up a level.
                    stack.pop()
                    treepos.pop()
            # word node:
            else:
                if wordnum == self.wordnum:
                    return tuple(treepos[:len(treepos)-self.height-1])
                else:
                    wordnum += 1
                    stack.pop()

 

class PropbankInflection(object):
    """
    The five-character inflection code of a propbank instance: one
    character each for form, tense, aspect, person and voice, in that
    order, with ``'-'`` meaning "none".
    """
    #{ Inflection Form
    INFINITIVE = 'i'
    GERUND = 'g'
    PARTICIPLE = 'p'
    FINITE = 'v'
    #{ Inflection Tense
    FUTURE = 'f'
    PAST = 'p'
    PRESENT = 'n'
    #{ Inflection Aspect
    PERFECT = 'p'
    PROGRESSIVE = 'o'
    PERFECT_AND_PROGRESSIVE = 'b'
    #{ Inflection Person
    THIRD_PERSON = '3'
    #{ Inflection Voice
    ACTIVE = 'a'
    PASSIVE = 'p'
    #{ Inflection
    NONE = '-'
    #}

    # One character class per position, in the order used by __str__.
    _VALIDATE = re.compile(r'[igpv\-][fpn\-][pob\-][3\-][ap\-]$')

    def __init__(self, form='-', tense='-', aspect='-', person='-', voice='-'):
        self.form = form
        self.tense = tense
        self.aspect = aspect
        self.person = person
        self.voice = voice

    def __str__(self):
        form_tense_aspect = self.form + self.tense + self.aspect
        return form_tense_aspect + self.person + self.voice

    def __repr__(self):
        return '<PropbankInflection: %s>' % (self,)

    @staticmethod
    def parse(s):
        """
        Build a ``PropbankInflection`` from its five-character code.

        :raise TypeError: if ``s`` is not a string.
        :raise ValueError: if ``s`` is not exactly five characters
            long or contains a character not allowed at its position.
        """
        if not isinstance(s, compat.string_types):
            raise TypeError('expected a string')
        well_formed = len(s) == 5 and PropbankInflection._VALIDATE.match(s)
        if not well_formed:
            raise ValueError('Bad propbank inflection string %r' % s)
        form, tense, aspect, person, voice = s
        return PropbankInflection(form, tense, aspect, person, voice)