# Natural Language Toolkit: CoNLL Corpus Reader

# 

# Copyright (C) 2001-2012 NLTK Project 

# Author: Steven Bird <sb@ldc.upenn.edu> 

#         Edward Loper <edloper@gradient.cis.upenn.edu> 

# URL: <http://www.nltk.org/> 

# For license information, see LICENSE.TXT 

 

""" 

Read CoNLL-style chunk files.

""" 

 

import os 

import codecs 

import textwrap 

 

from nltk import compat 

from nltk.tree import Tree 

from nltk.util import LazyMap, LazyConcatenation 

 

from .util import * 

from .api import * 

 

class ConllCorpusReader(CorpusReader): 

    """ 

    A corpus reader for CoNLL-style files.  These files consist of a 

    series of sentences, separated by blank lines.  Each sentence is 

    encoded using a table (or "grid") of values, where each line 

    corresponds to a single word, and each column corresponds to an 

    annotation type.  The set of columns used by CoNLL-style files can 

    vary from corpus to corpus; the ``ConllCorpusReader`` constructor 

    therefore takes an argument, ``columntypes``, which is used to 

    specify the columns that are used by a given corpus. 

 

    @todo: Add support for reading from corpora where different 

        parallel files contain different columns. 

    @todo: Possibly add caching of the grid corpus view?  This would 

        allow the same grid view to be used by different data access 

        methods (eg words() and parsed_sents() could both share the 

        same grid corpus view object). 

    @todo: Better support for -DOCSTART-.  Currently, we just ignore 

        it, but it could be used to define methods that retrieve a 

        document at a time (eg parsed_documents()). 
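
    A minimal usage sketch (the corpus root and file pattern below are
    hypothetical, not a shipped corpus):

        >>> from nltk.corpus.reader import ConllCorpusReader
        >>> reader = ConllCorpusReader('/path/to/corpus', r'.*\.conll',
        ...                            ('words', 'pos', 'chunk'))  # doctest: +SKIP
        >>> words = reader.words()           # doctest: +SKIP
        >>> sents = reader.tagged_sents()    # doctest: +SKIP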

    """ 

 

    #///////////////////////////////////////////////////////////////// 

    # Column Types 

    #///////////////////////////////////////////////////////////////// 

 

    WORDS = 'words'   #: column type for words 

    POS = 'pos'       #: column type for part-of-speech tags 

    TREE = 'tree'     #: column type for parse trees 

    CHUNK = 'chunk'   #: column type for chunk structures 

    NE = 'ne'         #: column type for named entities 

    SRL = 'srl'       #: column type for semantic role labels 

    IGNORE = 'ignore' #: column type for column that should be ignored 

 

    #: A list of all column types supported by the conll corpus reader. 

    COLUMN_TYPES = (WORDS, POS, TREE, CHUNK, NE, SRL, IGNORE) 

 

    #///////////////////////////////////////////////////////////////// 

    # Constructor 

    #///////////////////////////////////////////////////////////////// 

 

    def __init__(self, root, fileids, columntypes, 

                 chunk_types=None, top_node='S', pos_in_tree=False, 

                 srl_includes_roleset=True, encoding=None, 

                 tree_class=Tree, tag_mapping_function=None): 

        for columntype in columntypes: 

            if columntype not in self.COLUMN_TYPES: 

                raise ValueError('Bad column type %r' % columntype) 

        if isinstance(chunk_types, compat.string_types): 

            chunk_types = [chunk_types] 

        self._chunk_types = chunk_types 

        self._colmap = dict((c,i) for (i,c) in enumerate(columntypes)) 

        self._pos_in_tree = pos_in_tree 

        self._top_node = top_node # for chunks 

        self._srl_includes_roleset = srl_includes_roleset 

        self._tree_class = tree_class 

        CorpusReader.__init__(self, root, fileids, encoding) 

        self._tag_mapping_function = tag_mapping_function 

 

    #///////////////////////////////////////////////////////////////// 

    # Data Access Methods 

    #///////////////////////////////////////////////////////////////// 

 

    def raw(self, fileids=None): 

        if fileids is None: fileids = self._fileids 

        elif isinstance(fileids, compat.string_types): fileids = [fileids] 

        return concat([self.open(f).read() for f in fileids]) 

 

    def words(self, fileids=None): 

        self._require(self.WORDS) 

        return LazyConcatenation(LazyMap(self._get_words, self._grids(fileids))) 

 

    def sents(self, fileids=None): 

        self._require(self.WORDS) 

        return LazyMap(self._get_words, self._grids(fileids)) 

 

    def tagged_words(self, fileids=None, simplify_tags=False): 

        self._require(self.WORDS, self.POS) 

        def get_tagged_words(grid): 

            return self._get_tagged_words(grid, simplify_tags) 

        return LazyConcatenation(LazyMap(get_tagged_words, 

                                         self._grids(fileids))) 

 

    def tagged_sents(self, fileids=None, simplify_tags=False): 

        self._require(self.WORDS, self.POS) 

        def get_tagged_words(grid): 

            return self._get_tagged_words(grid, simplify_tags) 

        return LazyMap(get_tagged_words, self._grids(fileids)) 

 

    def chunked_words(self, fileids=None, chunk_types=None, 

                      simplify_tags=False): 

        self._require(self.WORDS, self.POS, self.CHUNK) 

        if chunk_types is None: chunk_types = self._chunk_types 

        def get_chunked_words(grid): # capture chunk_types as local var 

            return self._get_chunked_words(grid, chunk_types, simplify_tags) 

        return LazyConcatenation(LazyMap(get_chunked_words, 

                                         self._grids(fileids))) 

 

    def chunked_sents(self, fileids=None, chunk_types=None, 

                      simplify_tags=False): 

        self._require(self.WORDS, self.POS, self.CHUNK) 

        if chunk_types is None: chunk_types = self._chunk_types 

        def get_chunked_words(grid): # capture chunk_types as local var 

            return self._get_chunked_words(grid, chunk_types, simplify_tags) 

        return LazyMap(get_chunked_words, self._grids(fileids)) 

 

    def parsed_sents(self, fileids=None, pos_in_tree=None, simplify_tags=False): 

        self._require(self.WORDS, self.POS, self.TREE) 

        if pos_in_tree is None: pos_in_tree = self._pos_in_tree 

        def get_parsed_sent(grid): # capture pos_in_tree as local var 

            return self._get_parsed_sent(grid, pos_in_tree, simplify_tags) 

        return LazyMap(get_parsed_sent, self._grids(fileids)) 

 

    def srl_spans(self, fileids=None): 

        self._require(self.SRL) 

        return LazyMap(self._get_srl_spans, self._grids(fileids)) 

 

    def srl_instances(self, fileids=None, pos_in_tree=None, flatten=True): 

        self._require(self.WORDS, self.POS, self.TREE, self.SRL) 

        if pos_in_tree is None: pos_in_tree = self._pos_in_tree 

        def get_srl_instances(grid): # capture pos_in_tree as local var 

            return self._get_srl_instances(grid, pos_in_tree) 

        result = LazyMap(get_srl_instances, self._grids(fileids)) 

        if flatten: result = LazyConcatenation(result) 

        return result 

 

    def iob_words(self, fileids=None, simplify_tags=False): 

        """ 

        :return: a list of word/tag/IOB tuples 

        :rtype: list(tuple) 

        :param fileids: the list of fileids that make up this corpus 

        :type fileids: None or str or list 
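
        Each token is a ``(word, pos, iob)`` triple; for CoNLL-2000-style
        data a typical value would be ``('Confidence', 'NN', 'B-NP')``.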

        """ 

        self._require(self.WORDS, self.POS, self.CHUNK) 

        def get_iob_words(grid): 

            return self._get_iob_words(grid, simplify_tags) 

        return LazyConcatenation(LazyMap(get_iob_words, self._grids(fileids))) 

 

    def iob_sents(self, fileids=None, simplify_tags=False): 

        """ 

        :return: a list of lists of word/tag/IOB tuples 

        :rtype: list(list) 

        :param fileids: the list of fileids that make up this corpus 

        :type fileids: None or str or list 

        """ 

        self._require(self.WORDS, self.POS, self.CHUNK) 

        def get_iob_words(grid): 

            return self._get_iob_words(grid, simplify_tags) 

        return LazyMap(get_iob_words, self._grids(fileids)) 

 

    #///////////////////////////////////////////////////////////////// 

    # Grid Reading 

    #///////////////////////////////////////////////////////////////// 

 

    def _grids(self, fileids=None): 

        # n.b.: we could cache the object returned here (keyed on 

        # fileids), which would let us reuse the same corpus view for 

        # different things (eg srl and parse trees). 

        return concat([StreamBackedCorpusView(fileid, self._read_grid_block, 

                                              encoding=enc) 

                       for (fileid, enc) in self.abspaths(fileids, True)]) 

 

    def _read_grid_block(self, stream): 

        grids = [] 

        for block in read_blankline_block(stream): 

            block = block.strip() 

            if not block: continue 

 

            grid = [line.split() for line in block.split('\n')] 
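
            # e.g. a block 'the DT B-NP\ncat NN I-NP' becomes
            # [['the', 'DT', 'B-NP'], ['cat', 'NN', 'I-NP']].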

 

            # If there's a docstart row, then discard. ([xx] eventually it 

            # would be good to actually use it) 

            if grid[0][self._colmap.get('words', 0)] == '-DOCSTART-': 

                del grid[0] 

 

            # Check that the grid is consistent. 

            for row in grid: 

                if len(row) != len(grid[0]): 

                    raise ValueError('Inconsistent number of columns:\n%s' 

                                     % block) 

            grids.append(grid) 

        return grids 

 

    #///////////////////////////////////////////////////////////////// 

    # Transforms 

    #///////////////////////////////////////////////////////////////// 

    # given a grid, transform it into some representation (e.g., 

    # a list of words or a parse tree). 

 

    def _get_words(self, grid): 

        return self._get_column(grid, self._colmap['words']) 

 

    def _get_tagged_words(self, grid, simplify_tags=False): 

        pos_tags = self._get_column(grid, self._colmap['pos']) 

        if simplify_tags: 

            pos_tags = [self._tag_mapping_function(t) for t in pos_tags] 

        return list(zip(self._get_column(grid, self._colmap['words']), pos_tags))

 

    def _get_iob_words(self, grid, simplify_tags=False): 

        pos_tags = self._get_column(grid, self._colmap['pos']) 

        if simplify_tags: 

            pos_tags = [self._tag_mapping_function(t) for t in pos_tags] 

        return list(zip(self._get_column(grid, self._colmap['words']), pos_tags,

                        self._get_column(grid, self._colmap['chunk'])))

 

    def _get_chunked_words(self, grid, chunk_types, simplify_tags=False): 

        # n.b.: this method is very similar to conllstr2tree. 
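
        # Illustrative sketch (hypothetical input): IOB chunk tags such as
        #     B-NP I-NP O B-VP
        # group the tokens into
        #     Tree('S', [Tree('NP', [w0, w1]), w2, Tree('VP', [w3])])
        # where each w is a (word, pos_tag) pair and 'S' is self._top_node.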

        words = self._get_column(grid, self._colmap['words']) 

        pos_tags = self._get_column(grid, self._colmap['pos']) 

        if simplify_tags: 

            pos_tags = [self._tag_mapping_function(t) for t in pos_tags] 

        chunk_tags = self._get_column(grid, self._colmap['chunk']) 

 

        stack = [Tree(self._top_node, [])] 

 

        for (word, pos_tag, chunk_tag) in zip(words, pos_tags, chunk_tags): 

            if chunk_tag == 'O': 

                state, chunk_type = 'O', '' 

            else: 

                (state, chunk_type) = chunk_tag.split('-') 

            # If it's a chunk we don't care about, treat it as O. 

            if chunk_types is not None and chunk_type not in chunk_types: 

                state = 'O' 

            # Treat a mismatching I like a B. 

            if state == 'I' and chunk_type != stack[-1].node: 

                state = 'B' 

            # For B or I: close any open chunks 

            if state in 'BO' and len(stack) == 2: 

                stack.pop() 

            # For B: start a new chunk. 

            if state == 'B': 

                new_chunk = Tree(chunk_type, []) 

                stack[-1].append(new_chunk) 

                stack.append(new_chunk) 

            # Add the word token. 

            stack[-1].append((word, pos_tag)) 

 

        return stack[0] 

 

    def _get_parsed_sent(self, grid, pos_in_tree, simplify_tags=False): 

        words = self._get_column(grid, self._colmap['words']) 

        pos_tags = self._get_column(grid, self._colmap['pos']) 

        if simplify_tags: 

            pos_tags = [self._tag_mapping_function(t) for t in pos_tags] 

        parse_tags = self._get_column(grid, self._colmap['tree']) 
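
        # Illustrative sketch (hypothetical rows): for word/pos/tree columns
        #     the  DT   (S(NP*
        #     cat  NN   *)
        #     sat  VBD  (VP*))
        # the loop below assembles (up to spacing)
        #     (S(NP (DT the) (NN cat)) (VP (VBD sat)))
        # which is then parsed into a Tree.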

 

        treestr = '' 

        for (word, pos_tag, parse_tag) in zip(words, pos_tags, parse_tags): 

            if word == '(': word = '-LRB-' 

            if word == ')': word = '-RRB-' 

            if pos_tag == '(': pos_tag = '-LRB-' 

            if pos_tag == ')': pos_tag = '-RRB-' 

            (left, right) = parse_tag.split('*') 

            right = right.count(')')*')' # only keep ')'. 

            treestr += '%s (%s %s) %s' % (left, pos_tag, word, right) 

        try: 

            tree = self._tree_class.parse(treestr) 

        except (ValueError, IndexError): 

            tree = self._tree_class.parse('(%s %s)' % 

                                          (self._top_node, treestr)) 

 

        if not pos_in_tree: 

            for subtree in tree.subtrees(): 

                for i, child in enumerate(subtree): 

                    if (isinstance(child, Tree) and len(child)==1 and 

                        isinstance(child[0], compat.string_types)): 

                        subtree[i] = (child[0], child.node) 

 

        return tree 

 

    def _get_srl_spans(self, grid): 

        """ 

        :return: a list of lists of ``((start, end), tag)`` tuples, one

            spanlist per predicate in the sentence.

        """ 

        if self._srl_includes_roleset: 

            predicates = self._get_column(grid, self._colmap['srl']+1) 

            start_col = self._colmap['srl']+2 

        else: 

            predicates = self._get_column(grid, self._colmap['srl']) 

            start_col = self._colmap['srl']+1 

 

        # Count how many predicates there are.  This tells us how many 

        # columns to expect for SRL data. 

        num_preds = len([p for p in predicates if p != '-']) 
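
        # Illustrative sketch (hypothetical column): the SRL tags
        #     (A0*  *)  (V*)  (A1*  *)
        # (one per word) yield the spanlist
        #     [((0, 2), 'A0'), ((2, 3), 'V'), ((3, 5), 'A1')]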

 

        spanlists = [] 

        for i in range(num_preds): 

            col = self._get_column(grid, start_col+i) 

            spanlist = [] 

            stack = [] 

            for wordnum, srl_tag in enumerate(col): 

                (left, right) = srl_tag.split('*') 

                for tag in left.split('('): 

                    if tag: 

                        stack.append((tag, wordnum)) 

                for _ in range(right.count(')')):

                    (tag, start) = stack.pop() 

                    spanlist.append( ((start, wordnum+1), tag) ) 

            spanlists.append(spanlist) 

 

        return spanlists 

 

    def _get_srl_instances(self, grid, pos_in_tree): 

        tree = self._get_parsed_sent(grid, pos_in_tree) 

        spanlists = self._get_srl_spans(grid) 

        if self._srl_includes_roleset: 

            predicates = self._get_column(grid, self._colmap['srl']+1) 

            rolesets = self._get_column(grid, self._colmap['srl']) 

        else: 

            predicates = self._get_column(grid, self._colmap['srl']) 

            rolesets = [None] * len(predicates) 

 

        instances = ConllSRLInstanceList(tree) 

        for wordnum, predicate in enumerate(predicates): 

            if predicate == '-': continue 

            # Decide which spanlist to use.  Don't assume that they're 

            # sorted in the same order as the predicates (even though 

            # they usually are). 

            for spanlist in spanlists: 

                for (start, end), tag in spanlist: 

                    if wordnum in range(start,end) and tag in ('V', 'C-V'): 

                        break 

                else: continue 

                break 

            else: 

                raise ValueError('No srl column found for %r' % predicate) 

            instances.append(ConllSRLInstance(tree, wordnum, predicate, 

                                              rolesets[wordnum], spanlist)) 

 

        return instances 

 

    #///////////////////////////////////////////////////////////////// 

    # Helper Methods 

    #///////////////////////////////////////////////////////////////// 

 

    def _require(self, *columntypes): 

        for columntype in columntypes: 

            if columntype not in self._colmap: 

                raise ValueError('This corpus does not contain a %s ' 

                                 'column.' % columntype) 

 

    @staticmethod 

    def _get_column(grid, column_index): 

        return [grid[i][column_index] for i in range(len(grid))] 

 

 

class ConllSRLInstance(object): 

    """ 

    An SRL instance from a CoNLL corpus, which identifies and

    provides labels for the arguments of a single verb.

    """ 

    # [xx] add inst.core_arguments, inst.argm_arguments? 

 

    def __init__(self, tree, verb_head, verb_stem, roleset, tagged_spans): 

        self.verb = [] 

        """A list of the word indices of the words that compose the 

           verb whose arguments are identified by this instance. 

           This will contain multiple word indices when multi-word 

           verbs are used (e.g. 'turn on').""" 

 

        self.verb_head = verb_head 

        """The word index of the head word of the verb whose arguments 

           are identified by this instance.  E.g., for a sentence that 

           uses the verb 'turn on,' ``verb_head`` will be the word index 

           of the word 'turn'.""" 

 

        self.verb_stem = verb_stem 

 

        self.roleset = roleset 

 

        self.arguments = [] 

        """A list of ``(argspan, argid)`` tuples, specifying the location 

           and type for each of the arguments identified by this 

           instance.  ``argspan`` is a ``(start, end)`` tuple, indicating

           that the argument consists of ``words[start:end]``."""

 

        self.tagged_spans = tagged_spans 

        """A list of ``(span, id)`` tuples, specifying the location and 

           type for each of the arguments, as well as the verb pieces, 

           that make up this instance.""" 

 

        self.tree = tree 

        """The parse tree for the sentence containing this instance.""" 

 

        self.words = tree.leaves() 

        """A list of the words in the sentence containing this 

           instance.""" 

 

        # Fill in the self.verb and self.arguments values. 

        for (start, end), tag in tagged_spans: 

            if tag in ('V', 'C-V'): 

                self.verb += list(range(start, end)) 

            else: 

                self.arguments.append( ((start, end), tag) ) 

 

    def __repr__(self): 

        plural = 's' if len(self.arguments) != 1 else ''

        return '<ConllSRLInstance for %r with %d argument%s>' % ( 

            (self.verb_stem, len(self.arguments), plural)) 

 

    def pprint(self): 

        verbstr = ' '.join(self.words[i][0] for i in self.verb) 

        hdr = 'SRL for %r (stem=%r):\n' % (verbstr, self.verb_stem) 

        s = '' 

        for i, word in enumerate(self.words):

            if isinstance(word, tuple): word = word[0]

            for (start, end), argid in self.arguments:

                if i == start: s += '[%s ' % argid

                if i == end: s += '] '

            if i in self.verb: word = '<<%s>>' % word

            s += word + ' '

        # Close any argument span that extends to the end of the sentence

        # (the loop above only emits ']' *before* word ``end``).

        for (start, end), argid in self.arguments:

            if end == len(self.words): s += '] '

        return hdr + textwrap.fill(s.replace(' ]', ']'), 

                                   initial_indent='    ', 

                                   subsequent_indent='    ') 

 

class ConllSRLInstanceList(list): 

    """ 

    Set of SRL instances for a single sentence.

    """ 

    def __init__(self, tree, instances=()): 

        self.tree = tree 

        list.__init__(self, instances) 

 

    def __str__(self): 

        return self.pprint() 

 

    def pprint(self, include_tree=False): 

        # Sanity check: trees should be the same 

        for inst in self: 

            if inst.tree != self.tree: 

                raise ValueError('Tree mismatch!') 

 

        # If desired, add tree columns.  Note that ``words`` is needed

        # below even when ``include_tree`` is false, so it is computed

        # unconditionally.

        words = self.tree.leaves()

        if include_tree:

            pos = [None] * len(words)

            synt = ['*'] * len(words)

            self._tree2conll(self.tree, 0, words, pos, synt)

 

        s = '' 

        for i in range(len(words)): 

            # optional tree columns 

            if include_tree: 

                s += '%-20s ' % words[i] 

                s += '%-8s ' % pos[i] 

                s += '%15s*%-8s ' % tuple(synt[i].split('*')) 

 

            # verb head column 

            for inst in self: 

                if i == inst.verb_head: 

                    s += '%-20s ' % inst.verb_stem 

                    break 

            else: 

                s += '%-20s ' % '-' 

            # Remaining columns: one argument column per instance in self.

            for inst in self: 

                argstr = '*' 

                for (start, end), argid in inst.tagged_spans: 

                    if i==start: argstr = '(%s%s' % (argid, argstr) 

                    if i==(end-1): argstr += ')' 

                s += '%-12s ' % argstr 

            s += '\n' 

        return s 

 

    def _tree2conll(self, tree, wordnum, words, pos, synt): 
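
        # Recursive helper: walks ``tree`` left to right, recording each
        # leaf's tag in ``pos`` and the bracketing of internal nodes in
        # ``synt``; returns the index of the next unprocessed word.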

        assert isinstance(tree, Tree) 

        if len(tree) == 1 and isinstance(tree[0], compat.string_types): 

            pos[wordnum] = tree.node 

            assert words[wordnum] == tree[0] 

            return wordnum+1 

        elif len(tree) == 1 and isinstance(tree[0], tuple):

            assert len(tree[0]) == 2

            # The leaf is a (word, tag) pair; record both.

            words[wordnum], pos[wordnum] = tree[0]

            return wordnum+1 

        else: 

            # Insert this node's open bracket *after* any brackets already
            # opened at this word, so outer constituents come first
            # (e.g. '(S(NP*', not '(NP(S*').
            synt[wordnum] = '%s(%s*' % (synt[wordnum][:-1], tree.node)

            for child in tree: 

                wordnum = self._tree2conll(child, wordnum, words, 

                                                  pos, synt) 

            synt[wordnum-1] += ')' 

            return wordnum 

 

class ConllChunkCorpusReader(ConllCorpusReader): 

    """ 

    A ConllCorpusReader whose data file contains three columns: words, 

    pos, and chunk. 
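
    A hypothetical instantiation (root and file pattern are illustrative):

        >>> reader = ConllChunkCorpusReader('/path/to/corpus', r'.*\.conll',
        ...     chunk_types=('NP', 'VP', 'PP'))  # doctest: +SKIP
        >>> chunked = reader.chunked_sents()     # doctest: +SKIP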

    """ 

    def __init__(self, root, fileids, chunk_types, encoding=None, 

                 tag_mapping_function=None): 

        ConllCorpusReader.__init__( 

            self, root, fileids, ('words', 'pos', 'chunk'), 

            chunk_types=chunk_types, encoding=encoding, 

            tag_mapping_function=tag_mapping_function)