Hot-keys on this page

r m x p   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

1

2

3

4

5

6

7

8

9

10

11

12

13

14

15

16

17

18

19

20

21

22

23

24

25

26

27

28

29

30

31

32

33

34

35

36

37

38

39

40

41

42

43

44

45

46

47

48

49

50

51

52

53

54

55

56

57

58

59

60

61

62

63

64

65

66

67

68

69

70

71

72

73

74

75

76

77

78

79

80

81

82

83

84

85

86

87

88

89

90

91

92

93

94

95

96

97

98

99

100

101

102

103

104

105

106

107

108

109

110

111

112

113

114

115

116

117

118

119

120

121

122

123

124

125

126

127

128

129

130

131

132

133

134

135

136

137

138

139

140

141

142

143

144

145

146

147

148

149

150

151

152

153

154

155

156

157

158

159

160

161

162

163

164

165

166

167

168

169

170

171

172

173

174

175

176

177

178

179

180

181

182

183

184

185

186

187

188

189

190

191

192

193

194

195

196

197

198

199

200

201

202

203

204

205

206

207

208

209

210

211

212

213

214

215

216

217

218

219

220

221

222

223

224

225

226

227

228

229

230

231

232

233

234

235

236

237

238

239

240

241

242

243

244

245

246

247

248

249

250

251

252

253

254

255

256

257

258

259

260

261

262

263

264

265

266

267

268

269

270

271

272

273

274

275

276

277

278

279

280

281

282

283

284

285

286

287

288

289

290

291

292

293

294

295

296

297

298

299

300

301

302

303

304

305

306

307

308

309

310

311

312

313

314

315

316

317

318

319

320

321

322

323

324

325

326

327

328

329

330

331

332

333

334

335

336

337

338

339

340

341

342

343

344

345

346

347

348

349

350

351

352

353

354

355

356

357

358

359

360

361

362

363

364

365

366

367

368

369

370

371

372

373

374

375

376

377

378

379

380

381

382

383

384

385

386

387

388

389

390

391

392

393

394

395

396

397

398

399

400

401

402

403

404

405

406

407

408

409

410

411

412

413

414

415

416

417

418

419

420

421

422

423

424

425

426

427

428

429

430

431

432

433

434

435

436

437

438

439

440

441

442

443

444

445

446

447

448

449

450

451

452

453

454

455

456

457

458

459

460

461

462

463

464

465

466

467

468

469

470

471

472

473

474

475

476

477

478

479

480

481

482

483

484

485

486

487

488

489

490

491

492

493

494

495

496

497

498

499

500

501

502

503

504

505

506

507

508

509

510

511

512

513

514

515

516

517

518

519

520

521

522

523

524

525

526

527

528

529

530

531

532

533

534

535

536

537

538

539

540

541

542

543

544

545

546

547

548

549

550

551

552

553

554

555

556

557

558

559

560

561

562

563

564

565

566

567

568

569

570

571

572

573

574

575

576

577

578

579

580

581

582

583

584

585

586

587

588

589

590

591

592

593

594

595

596

597

598

599

600

601

# Natural Language Toolkit: Discourse Processing 

# 

# Author: Ewan Klein <ewan@inf.ed.ac.uk> 

#         Dan Garrette <dhgarrette@gmail.com> 

# 

# URL: <http://www.nltk.org/> 

# For license information, see LICENSE.TXT 

 

""" 

Module for incrementally developing simple discourses, and checking for semantic ambiguity, 

consistency and informativeness. 

 

Many of the ideas are based on the CURT family of programs of Blackburn and Bos 

(see http://homepages.inf.ed.ac.uk/jbos/comsem/book1.html). 

 

Consistency checking is carried out by using the ``mace`` module to call the Mace4 model builder.

Informativeness checking is carried out with a call to ``Prover.prove()`` from 

the ``inference``  module. 

 

``DiscourseTester`` is a constructor for discourses. 

The basic data structure is a list of sentences, stored as ``self._sentences``. Each sentence in the list 

is assigned a "sentence ID" (``sid``) of the form ``s``\ *i*. For example:: 

 

    s0: A boxer walks 

    s1: Every boxer chases a girl 

 

Each sentence can be ambiguous between a number of readings, each of which receives a 

"reading ID" (``rid``) of the form ``s``\ *i* -``r``\ *j*. For example:: 

 

    s0 readings: 

 

    s0-r1: some x.(boxer(x) & walk(x)) 

    s0-r0: some x.(boxerdog(x) & walk(x)) 

 

A "thread" is a list of readings, represented as a list of ``rid``\ s. 

Each thread receives a "thread ID" (``tid``) of the form ``d``\ *i*. 

For example:: 

 

    d0: ['s0-r0', 's1-r0'] 

 

The set of all threads for a discourse is the Cartesian product of the sets of readings of its sentences, taken in discourse order.

(This is not intended to scale beyond very short discourses!) The method ``readings(filter=True)`` will only show 

those threads which are consistent (taking into account any background assumptions). 

""" 

from __future__ import print_function 

 

import os 

from operator import and_, add 

from functools import reduce 

 

from nltk.data import show_cfg 

from nltk.tag import RegexpTagger 

from nltk.parse import load_parser 

from nltk.parse.malt import MaltParser 

from nltk.sem.drt import resolve_anaphora, AnaphoraResolutionException 

from nltk.sem.glue import DrtGlue 

 

from nltk.inference.mace import MaceCommand 

from nltk.inference.prover9 import Prover9Command 

 

 

class ReadingCommand(object):
    """
    Interface for objects that turn sentences into semantic readings and
    combine the readings of a thread.

    ``parse_to_readings()`` and ``combine_readings()`` must be overridden;
    ``process_thread()`` has a pass-through default.
    """

    def parse_to_readings(self, sentence):
        """
        Produce the readings of a single sentence.

        :param sentence: the sentence to read
        :type sentence: str
        :raise NotImplementedError: always, in this base class
        """
        raise NotImplementedError

    def process_thread(self, sentence_readings):
        """
        Hook for handling dependencies between the readings of one thread,
        such as resolving anaphora.  The base implementation hands the
        argument back untouched.

        :param sentence_readings: readings to process
        :type sentence_readings: list(Expression)
        :return: the list of readings after processing
        :rtype: list(Expression)
        """
        return sentence_readings

    def combine_readings(self, readings):
        """
        Merge several readings into a single one.

        :param readings: readings to combine
        :type readings: list(Expression)
        :return: one combined reading
        :rtype: Expression
        :raise NotImplementedError: always, in this base class
        """
        raise NotImplementedError

 

 

class CfgReadingCommand(ReadingCommand):
    """
    ``ReadingCommand`` that obtains readings by parsing with a grammar
    loaded through ``load_parser`` and conjoins readings with ``and_``.
    """

    def __init__(self, gramfile=None):
        """
        Load the parser for the chosen grammar.

        :param gramfile: name of file where grammar can be loaded; when
            ``None`` the book grammar ``discourse.fcfg`` is used
        :type gramfile: str
        """
        default_grammar = 'grammars/book_grammars/discourse.fcfg'
        self._gramfile = default_grammar if gramfile is None else gramfile
        self._parser = load_parser(self._gramfile)

    def parse_to_readings(self, sentence):
        """:see: ReadingCommand.parse_to_readings()"""
        from nltk.sem import root_semrep
        # The sentence is tokenized on whitespace only.
        parses = self._parser.nbest_parse(sentence.split())
        return list(map(root_semrep, parses))

    def combine_readings(self, readings):
        """:see: ReadingCommand.combine_readings()"""
        # Fold the readings together pairwise with ``and_``.
        conjunction = reduce(and_, readings)
        return conjunction

 

 

class DrtGlueReadingCommand(ReadingCommand):
    """
    ``ReadingCommand`` backed by ``DrtGlue``; the readings of a thread are
    added together, simplified and passed through ``resolve_anaphora``.
    """

    def __init__(self, semtype_file=None, remove_duplicates=False,
                 depparser=None):
        """
        Build the underlying ``DrtGlue`` instance.

        :param semtype_file: name of file where grammar can be loaded;
            ``'drt_glue.semtype'`` is used when ``None``
        :param remove_duplicates: should duplicates be removed?
        :param depparser: the dependency parser
        """
        glue_options = dict(
            semtype_file=('drt_glue.semtype' if semtype_file is None
                          else semtype_file),
            remove_duplicates=remove_duplicates,
            depparser=depparser,
        )
        self._glue = DrtGlue(**glue_options)

    def parse_to_readings(self, sentence):
        """:see: ReadingCommand.parse_to_readings()"""
        glue = self._glue
        return glue.parse_to_meaning(sentence)

    def process_thread(self, sentence_readings):
        """:see: ReadingCommand.process_thread()"""
        # A thread whose anaphora cannot be resolved yields no readings.
        try:
            combined = self.combine_readings(sentence_readings)
        except AnaphoraResolutionException:
            return []
        return [combined]

    def combine_readings(self, readings):
        """:see: ReadingCommand.combine_readings()"""
        merged = reduce(add, readings).simplify()
        return resolve_anaphora(merged)

 

 

class DiscourseTester(object):
    """
    Check properties of an ongoing discourse.

    Sentences, readings and threads are keyed by generated string IDs
    (``s0``, ``s0-r1``, ``d3``, ...).  Wherever those IDs are put in order
    they are compared numerically, so ``s10`` follows ``s9`` rather than
    ``s1``.
    """

    def __init__(self, input, reading_command=None, background=None):
        """
        Initialize a ``DiscourseTester``.

        :param input: the discourse sentences
        :type input: list of str
        :param reading_command: object used to turn sentences into readings;
            a ``CfgReadingCommand`` is created when this is ``None``
        :type reading_command: ReadingCommand
        :param background: Formulas which express background assumptions
        :type background: list(Expression)
        """
        # Work on a private copy: add_sentence()/retract_sentence() modify
        # this list and must not change the list owned by the caller.
        self._input = list(input)
        self._reindex_sentences()
        self._models = None
        self._readings = {}
        if reading_command is None:
            self._reading_command = CfgReadingCommand()
        else:
            self._reading_command = reading_command
        self._threads = {}
        self._filtered_threads = {}
        if background is not None:
            from nltk.sem.logic import Expression
            for e in background:
                assert isinstance(e, Expression)
            # Copied for the same reason as ``input``: add_background()
            # appends to this list.
            self._background = list(background)
        else:
            self._background = []

    ###############################
    # Sentences
    ###############################

    @staticmethod
    def _id_key(identifier):
        """
        Sort key that orders IDs such as ``s2``, ``s10`` or ``s0-r11`` by
        the numeric value of their digit runs instead of character by
        character.

        :param identifier: a sentence, reading or thread ID
        :type identifier: str
        :rtype: list
        """
        import re
        # Splitting on a capturing group puts the digit runs at the odd
        # positions, so keys always compare str with str and int with int.
        pieces = re.split(r'([0-9]+)', identifier)
        return [int(piece) if pos % 2 else piece
                for pos, piece in enumerate(pieces)]

    def _reindex_sentences(self):
        """
        Rebuild ``self._sentences`` (sentence ID -> sentence) from
        ``self._input``.
        """
        self._sentences = dict(('s%s' % i, sent)
                               for i, sent in enumerate(self._input))

    def sentences(self):
        """
        Display the list of sentences in the current discourse.
        """
        for sid in sorted(self._sentences, key=self._id_key):
            print("%s: %s" % (sid, self._sentences[sid]))

    def add_sentence(self, sentence, informchk=False, consistchk=False):
        """
        Add a sentence to the current discourse.

        Updates ``self._input`` and ``self._sentences``.
        :param sentence: An input sentence
        :type sentence: str
        :param informchk: if ``True``, check that the result of adding the sentence is thread-informative. Updates ``self._readings``.
        :param consistchk: if ``True``, check that the result of adding the sentence is thread-consistent. Updates ``self._readings``.

        """
        # check whether the new sentence is informative (i.e. not entailed by the previous discourse)
        if informchk:
            self.readings(verbose=False)
            # The readings of the new sentence do not depend on the thread,
            # so they are parsed once, on first use, and reused afterwards.
            sent_readings = None
            for tid in sorted(self._threads, key=self._id_key):
                assumptions = [reading for (rid, reading) in self.expand_threads(tid)]
                assumptions += self._background
                if sent_readings is None:
                    sent_readings = list(self._get_readings(sentence))
                for sent_reading in sent_readings:
                    tp = Prover9Command(goal=sent_reading, assumptions=assumptions)
                    if tp.prove():
                        print("Sentence '%s' under reading '%s':" % (sentence, str(sent_reading)))
                        print("Not informative relative to thread '%s'" % tid)

        self._input.append(sentence)
        self._reindex_sentences()
        # check whether adding the new sentence to the discourse preserves
        # consistency (i.e. a model can be found for the combined set of
        # assumptions)
        if consistchk:
            self.readings(verbose=False)
            self.models(show=False)

    def retract_sentence(self, sentence, verbose=True):
        """
        Remove a sentence from the current discourse.

        Updates ``self._input``, ``self._sentences`` and ``self._readings``.
        If the sentence is not part of the discourse a message and the
        current sentences are printed and nothing is changed.

        :param sentence: An input sentence
        :type sentence: str
        :param verbose: If ``True``,  report on the updated list of sentences.
        """
        try:
            self._input.remove(sentence)
        except ValueError:
            print("Retraction failed. The sentence '%s' is not part of the current discourse:" % sentence)
            self.sentences()
            return None
        self._reindex_sentences()
        self.readings(verbose=False)
        if verbose:
            print("Current sentences are ")
            self.sentences()

    def grammar(self):
        """
        Print out the grammar in use for parsing input sentences.

        Reads the ``_gramfile`` attribute of the reading command, which in
        this module is only set by ``CfgReadingCommand``.
        """
        show_cfg(self._reading_command._gramfile)

    ###############################
    # Readings and Threads
    ###############################

    def _get_readings(self, sentence):
        """
        Build a list of semantic readings for a sentence.

        :rtype: list(Expression)
        """
        return self._reading_command.parse_to_readings(sentence)

    def _construct_readings(self):
        """
        Use ``self._sentences`` to construct a value for ``self._readings``.
        """
        # re-initialize self._readings in case we have retracted a sentence
        self._readings = {}
        for sid, sentence in self._sentences.items():
            readings = self._get_readings(sentence)
            self._readings[sid] = dict(("%s-r%s" % (sid, rid), reading.simplify())
                                       for rid, reading in enumerate(readings))

    def _construct_threads(self):
        """
        Use ``self._readings`` to construct a value for ``self._threads``
        and use the model builder to construct a value for ``self._filtered_threads``
        """
        thread_list = [[]]
        # Numeric ordering keeps the readings of each thread in discourse
        # order even when there are more than ten sentences or readings.
        for sid in sorted(self._readings, key=self._id_key):
            rids = sorted(self._readings[sid], key=self._id_key)
            thread_list = self.multiply(thread_list, rids)
        self._threads = dict(("d%s" % tid, thread)
                             for tid, thread in enumerate(thread_list))
        # re-initialize the filtered threads
        self._filtered_threads = {}
        # keep the same ids, but only include threads which get models
        consistency_checked = set(self._check_consistency(self._threads))
        for (tid, thread) in self._threads.items():
            if (tid, True) in consistency_checked:
                self._filtered_threads[tid] = thread

    def _show_readings(self, sentence=None):
        """
        Print out the readings for  the discourse (or a single sentence).

        :param sentence: if given, only the readings of this sentence are
            computed and printed; otherwise ``self._readings`` is printed
        :type sentence: str
        """
        if sentence is not None:
            print("The sentence '%s' has these readings:" % sentence)
            for r in [str(reading) for reading in (self._get_readings(sentence))]:
                print("    %s" % r)
        else:
            for sid in sorted(self._readings, key=self._id_key):
                print()
                print('%s readings:' % sid)
                print() #'-' * 30
                for rid in sorted(self._readings[sid], key=self._id_key):
                    lf = self._readings[sid][rid]
                    #TODO lf = lf.normalize('[xyz]\d*', 'z%d')
                    print("%s: %s" % (rid, lf))

    def _show_threads(self, filter=False, show_thread_readings=False):
        """
        Print out the value of ``self._threads`` or ``self._filtered_threads``

        :param filter: if ``True``, print ``self._filtered_threads`` instead
            of ``self._threads``
        :param show_thread_readings: if ``True``, append the combined reading
            of each thread, or ``INVALID`` plus the exception class name when
            combining the readings fails
        """
        if filter:
            threads = self._filtered_threads
        else:
            threads = self._threads
        for tid in sorted(threads, key=self._id_key):
            if show_thread_readings:
                readings = [self._readings[rid.split('-')[0]][rid]
                            for rid in threads[tid]]
                try:
                    combined = self._reading_command.combine_readings(readings)
                    thread_reading = ": %s" % combined
                except Exception as e:
                    thread_reading = ': INVALID: %s' % e.__class__.__name__
            else:
                thread_reading = ''

            print("%s:" % tid, threads[tid], thread_reading)

    def readings(self, sentence=None, threaded=False, verbose=True,
                 filter=False, show_thread_readings=False):
        """
        Construct and show the readings of the discourse (or of a single sentence).

        The readings and threads are always rebuilt; ``verbose`` only
        controls whether anything is printed.

        :param sentence: test just this sentence
        :type sentence: str
        :param threaded: if ``True``, print out each thread ID and the corresponding thread.
        :param verbose: if ``False``, rebuild the readings and threads without printing
        :param filter: if ``True``, only print out consistent thread IDs and threads.
        :param show_thread_readings: if ``True``, print the combined reading of each thread
        """
        self._construct_readings()
        self._construct_threads()

        # if we are filtering or showing thread readings, show threads
        if filter or show_thread_readings:
            threaded = True

        if verbose:
            if not threaded:
                self._show_readings(sentence=sentence)
            else:
                self._show_threads(filter=filter,
                                   show_thread_readings=show_thread_readings)

    def expand_threads(self, thread_id, threads=None):
        """
        Given a thread ID, find the list of ``logic.Expression`` objects corresponding to the reading IDs in that thread.

        :param thread_id: thread ID
        :type thread_id: str
        :param threads: a mapping from thread IDs to lists of reading IDs
        :type threads: dict
        :return: A list of pairs ``(rid, reading)`` where reading is the ``logic.Expression`` associated with a reading ID
        :rtype: list of tuple
        """
        if threads is None:
            threads = self._threads
        # The sentence ID is the part of the reading ID before the first '-'.
        return [(rid, self._readings[rid.split('-')[0]][rid])
                for rid in threads[thread_id]]

    ###############################
    # Models and Background
    ###############################

    def _check_consistency(self, threads, show=False, verbose=False):
        """
        Try to build a model for each thread in ``threads``.

        :param threads: a mapping from thread IDs to lists of reading IDs
        :type threads: dict
        :param show: if ``True``, print the model found (or a failure notice) for each thread
        :param verbose: if ``True`` (and ``show``), also print the assumptions used
        :return: a list of pairs ``(tid, modelfound)``
        :rtype: list of tuple
        """
        results = []
        for tid in sorted(threads, key=self._id_key):
            assumptions = [reading for (rid, reading) in self.expand_threads(tid, threads=threads)]
            assumptions = self._reading_command.process_thread(assumptions)
            if assumptions:
                assumptions += self._background
                # if Mace4 finds a model, it always seems to find it quickly
                mb = MaceCommand(None, assumptions, max_models=20)
                modelfound = mb.build_model()
            else:
                # Nothing left after processing the thread: count it as
                # having no model.
                modelfound = False
            results.append((tid, modelfound))
            if show:
                spacer(80)
                print("Model for Discourse Thread %s" % tid)
                spacer(80)
                if verbose:
                    for a in assumptions:
                        print(a)
                    spacer(80)
                if modelfound:
                    print(mb.model(format='cooked'))
                else:
                    print("No model found!\n")
        return results

    def models(self, thread_id=None, show=True, verbose=False):
        """
        Call Mace4 to build a model for each current discourse thread.

        :param thread_id: thread ID; when ``None`` every thread is checked
        :type thread_id: str
        :param show: If ``True``, display the model that has been found.
        :param verbose: passed on to the consistency check, which then also prints the assumptions
        """
        self._construct_readings()
        self._construct_threads()
        if thread_id is None:
            threads = self._threads
        else:
            threads = {thread_id: self._threads[thread_id]}

        for (tid, modelfound) in self._check_consistency(threads, show=show, verbose=verbose):
            idlist = list(threads[tid])
            if modelfound:
                verdict = "Consistent"
            else:
                verdict = "Inconsistent"
            print("%s discourse: %s %s:" % (verdict, tid, idlist))
            for rid, reading in self.expand_threads(tid):
                print("    %s: %s" % (rid, str(reading)))
            print()

    def add_background(self, background, verbose=False):
        """
        Add a list of background assumptions for reasoning about the discourse.

        When called,  this method also updates the discourse model's set of readings and threads.
        :param background: Formulas which contain background information
        :type background: list(Expression)
        :param verbose: if ``True``, report each assumption as it is added
        """
        from nltk.sem import Expression
        for (count, e) in enumerate(background):
            assert isinstance(e, Expression)
            if verbose:
                print("Adding assumption %s to background" % count)
            self._background.append(e)

        #update the state
        self._construct_readings()
        self._construct_threads()

    def background(self):
        """
        Show the current background assumptions.
        """
        for e in self._background:
            print(str(e))

    ###############################
    # Misc
    ###############################

    @staticmethod
    def multiply(discourse, readings):
        """
        Multiply every thread in ``discourse`` by every reading in ``readings``.

        Given discourse = [['A'], ['B']], readings = ['a', 'b', 'c'] , returns
        [['A', 'a'], ['A', 'b'], ['A', 'c'], ['B', 'a'], ['B', 'b'], ['B', 'c']]

        :param discourse: the current list of readings
        :type discourse: list of lists
        :param readings: an additional list of readings
        :type readings: list(Expression)
        :rtype: A list of lists
        """
        # Each result is a fresh list, so the input threads are not modified.
        return [sublist + [r] for sublist in discourse for r in readings]

 

#multiply = DiscourseTester.multiply 

#L1 = [['A'], ['B']] 

#L2 = ['a', 'b', 'c'] 

#print multiply(L1,L2) 

 

def parse_fol(s):
    """
    Temporarily duplicated from ``nltk.sem.util``.
    Convert a  file of first order formulas into a list of ``Expression`` objects.

    Blank lines and lines starting with ``#`` are skipped.

    :param s: the contents of the file
    :type s: str
    :return: a list of parsed formulas.
    :rtype: list(Expression)
    :raise ValueError: if a line cannot be parsed; the message gives the
        zero-based line number and the offending line
    """
    from nltk.sem import LogicParser
    statements = []
    lp = LogicParser()
    for linenum, line in enumerate(s.splitlines()):
        line = line.strip()
        if not line or line.startswith('#'):
            continue
        try:
            parsed = lp.parse(line)
        except Exception:
            # The original caught the undefined name ``Error``, which turned
            # every parse failure into a NameError instead of this ValueError.
            raise ValueError('Unable to parse line %s: %s' % (linenum, line))
        statements.append(parsed)
    return statements

 

############################### 

# Demo 

############################### 

 

def discourse_demo(reading_command=None):
    """
    Illustrate the various methods of ``DiscourseTester``

    :param reading_command: handed to every ``DiscourseTester`` created
        here; with ``None`` the tester builds its own ``CfgReadingCommand``
    """
    # First discourse: models, sentences, readings and threads.
    dt = DiscourseTester(['A boxer walks', 'Every boxer chases a girl'],
                         reading_command)
    dt.models()
    print()
    #dt.grammar()
    print()
    dt.sentences()
    print()
    dt.readings()
    print()
    dt.readings(threaded=True)
    print()
    # Restrict model building to the single thread 'd1'.
    dt.models('d1')
    dt.add_sentence('John is a boxer')
    print()
    dt.sentences()
    print()
    dt.readings(threaded=True)
    print()
    # Second discourse: add a sentence with the consistency check switched
    # on, then retract it again.
    dt = DiscourseTester(['A student dances', 'Every student is a person'],
                         reading_command)
    print()
    dt.add_sentence('No person dances', consistchk=True)
    print()
    dt.readings()
    print()
    dt.retract_sentence('No person dances', verbose=True)
    print()
    dt.models()
    print()
    # Show the readings of one sentence, then add it with the
    # informativeness check switched on.
    dt.readings('A person dances')
    print()
    dt.add_sentence('A person dances', informchk=True)
    # Third discourse: compare the consistent threads before and after
    # background assumptions are added.
    dt = DiscourseTester(['Vincent is a boxer', 'Fido is a boxer',
                          'Vincent is married', 'Fido barks'],
                          reading_command)
    dt.readings(filter=True)
    import nltk.data
    background = nltk.data.load('/grammars/book_grammars/background.fol')
    print()
    dt.add_background(background, verbose=False)
    dt.background()
    print()
    dt.readings(filter=True)
    print()
    dt.models()

 

 

def drt_discourse_demo(reading_command=None):
    """
    Illustrate the various methods of ``DiscourseTester``

    :param reading_command: handed to the ``DiscourseTester`` created here;
        ``demo()`` passes a ``DrtGlueReadingCommand``
    """
    dt = DiscourseTester(['every dog chases a boy', 'he runs'],
                         reading_command)
    dt.models()
    print()
    dt.sentences()
    print()
    dt.readings()
    print()
    # Print every thread together with its combined reading.
    dt.readings(show_thread_readings=True)
    print()
    # Same, but restricted to the threads for which a model was found.
    dt.readings(filter=True, show_thread_readings=True)

 

 

def spacer(num=30):
    """
    Print a separator line consisting of ``num`` hyphens.

    :param num: number of hyphens to print
    :type num: int
    """
    rule = '-' * num
    print(rule)

 

def demo():
    """
    Run ``discourse_demo()`` with its default reading command, then
    ``drt_discourse_demo()`` with a ``DrtGlueReadingCommand`` whose
    dependency parser is a ``MaltParser`` using a small regexp tagger.
    """
    discourse_demo()

    # Tag patterns for the words used by the DRT demo sentences.
    tag_patterns = [
        ('^(chases|runs)$', 'VB'),
        ('^(a)$', 'ex_quant'),
        ('^(every)$', 'univ_quant'),
        ('^(dog|boy)$', 'NN'),
        ('^(he)$', 'PRP'),
    ]
    tagger = RegexpTagger(tag_patterns)
    depparser = MaltParser(tagger=tagger)
    reading_command = DrtGlueReadingCommand(remove_duplicates=False,
                                            depparser=depparser)
    drt_discourse_demo(reading_command)

 

# Run the demonstrations only when this module is executed as a script.
if __name__ == '__main__':
    demo()