Hot-keys on this page

r m x p   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

1

2

3

4

5

6

7

8

9

10

11

12

13

14

15

16

17

18

19

20

21

22

23

24

25

26

27

28

29

30

31

32

33

34

35

36

37

38

39

40

41

42

43

44

45

46

47

48

49

50

51

52

53

54

55

56

57

58

59

60

61

62

63

64

65

66

67

68

69

70

71

72

73

74

75

76

77

78

79

80

81

82

83

84

85

86

87

88

89

90

91

92

93

94

95

96

97

98

99

100

101

102

103

104

105

106

107

108

109

110

111

112

113

114

115

116

117

118

119

120

121

122

123

124

125

126

127

128

129

130

131

132

133

134

135

136

137

138

139

140

141

142

143

144

145

146

147

148

149

150

151

152

153

154

155

156

157

158

159

160

161

162

163

164

165

166

167

168

169

170

171

172

173

174

175

176

177

178

179

180

181

182

183

184

185

186

187

188

189

190

191

192

193

194

195

196

197

198

199

200

201

202

203

204

205

206

207

208

209

210

211

212

213

214

215

216

217

218

219

220

221

222

223

224

225

226

227

228

229

230

231

232

233

234

235

236

237

238

239

240

241

242

243

244

245

246

247

248

249

250

251

252

253

254

255

256

257

258

259

260

261

262

263

264

265

266

267

268

269

270

271

272

273

274

275

276

277

278

279

280

281

282

283

284

285

286

287

288

289

290

291

292

293

294

295

296

297

298

299

300

301

302

303

304

305

306

307

308

309

310

311

312

313

314

315

316

317

318

319

320

321

322

323

324

325

326

327

328

329

330

331

332

333

334

335

336

337

338

339

340

341

342

343

344

345

346

347

348

349

350

351

352

353

354

355

356

357

358

359

360

361

362

363

364

365

366

367

368

369

370

371

372

373

374

375

376

377

378

379

380

381

382

383

384

385

386

387

388

389

390

391

392

393

394

395

396

397

398

399

400

401

402

403

404

405

406

407

408

409

410

411

412

413

414

415

416

417

418

419

420

421

422

423

424

425

426

427

428

429

430

431

432

433

434

435

436

437

438

439

440

441

442

443

444

445

446

447

448

449

450

451

452

453

454

455

456

457

458

459

460

461

462

463

464

465

466

467

468

469

470

471

472

473

474

475

476

477

478

479

480

481

482

483

484

485

486

# Natural Language Toolkit: Probabilistic Chart Parsers 

# 

# Copyright (C) 2001-2012 NLTK Project 

# Author: Edward Loper <edloper@gradient.cis.upenn.edu> 

#         Steven Bird <sb@csse.unimelb.edu.au> 

# URL: <http://www.nltk.org/> 

# For license information, see LICENSE.TXT 

 

""" 

Classes and interfaces for associating probabilities with tree 

structures that represent the internal organization of a text.  The 

probabilistic parser module defines ``BottomUpProbabilisticChartParser``. 

 

``BottomUpProbabilisticChartParser`` is an abstract class that implements 

a bottom-up chart parser for ``PCFG`` grammars.  It maintains a queue of edges, 

and adds them to the chart one at a time.  The ordering of this queue 

is based on the probabilities associated with the edges, allowing the 

parser to expand more likely edges before less likely ones.  Each 

subclass implements a different queue ordering, producing different 

search strategies.  Currently the following subclasses are defined: 

 

  - ``InsideChartParser`` searches edges in decreasing order of 

    their trees' inside probabilities. 

  - ``RandomChartParser`` searches edges in random order. 

  - ``LongestChartParser`` searches edges in decreasing order of their 

    location's length. 

 

The ``BottomUpProbabilisticChartParser`` constructor has an optional 

argument beam_size.  If non-zero, this controls the size of the beam 

(aka the edge queue).  This option is most useful with InsideChartParser. 

""" 

from __future__ import print_function 

 

##////////////////////////////////////////////////////// 

##  Bottom-Up PCFG Chart Parser 

##////////////////////////////////////////////////////// 

 

# [XX] This might not be implemented quite right -- it would be better 

# to associate probabilities with child pointer lists. 

 

from functools import reduce 

from nltk.tree import Tree, ProbabilisticTree 

from nltk.grammar import Nonterminal, WeightedGrammar 

 

from nltk.parse.api import ParserI 

from nltk.parse.chart import Chart, LeafEdge, TreeEdge, AbstractChartRule 

 

# Probabilistic edges 

class ProbabilisticLeafEdge(LeafEdge):
    """A leaf edge whose inside probability is always 1.0."""

    def prob(self):
        return 1.0

 

class ProbabilisticTreeEdge(TreeEdge):
    """
    A ``TreeEdge`` annotated with the inside probability of the
    (partial) tree it represents.
    """

    def __init__(self, prob, *args, **kwargs):
        # prob: the inside probability of this edge's tree; all other
        # arguments are forwarded unchanged to TreeEdge.__init__.
        self._prob = prob
        TreeEdge.__init__(self, *args, **kwargs)

    def prob(self): return self._prob

    def __cmp__(self, other):
        # Edges with different probabilities always compare unequal (-1);
        # otherwise defer to TreeEdge's comparison.
        # NOTE(review): __cmp__ is the Python 2 comparison protocol and is
        # ignored under Python 3 -- confirm the target interpreter if
        # ordering/equality of these edges matters there.
        if self._prob != other.prob(): return -1
        return TreeEdge.__cmp__(self, other)

    @staticmethod
    def from_production(production, index, p):
        # Build a zero-width edge (span (index, index), dot at position 0)
        # for `production` with probability `p`.
        return ProbabilisticTreeEdge(p, (index, index), production.lhs(),
                                     production.rhs(), 0)

 

# Rules using probabilistic edges 

class ProbabilisticBottomUpInitRule(AbstractChartRule):
    """Seed the chart with a probability-1 leaf edge for every token."""
    NUM_EDGES=0

    def apply_iter(self, chart, grammar):
        for position in range(chart.num_leaves()):
            leaf_edge = ProbabilisticLeafEdge(chart.leaf(position), position)
            # Only generate edges that were not already in the chart.
            if chart.insert(leaf_edge, ()):
                yield leaf_edge

 

class ProbabilisticBottomUpPredictRule(AbstractChartRule):
    """
    For a complete edge, predict a new zero-width edge for every grammar
    production whose right-hand side begins with that edge's left-hand side.
    """
    NUM_EDGES=1

    def apply_iter(self, chart, grammar, edge):
        if edge.is_incomplete():
            return
        for production in grammar.productions():
            if production.rhs()[0] == edge.lhs():
                predicted = ProbabilisticTreeEdge.from_production(
                    production, edge.start(), production.prob())
                # Only generate edges that were not already in the chart.
                if chart.insert(predicted, ()):
                    yield predicted

 

class ProbabilisticFundamentalRule(AbstractChartRule):
    """
    Combine an incomplete left edge with an adjacent complete right edge,
    producing a new edge whose dot has advanced by one and whose
    probability is the product of the two input edges' probabilities.
    """
    NUM_EDGES=2
    def apply_iter(self, chart, grammar, left_edge, right_edge):
        # Make sure the rule is applicable.
        # NOTE(review): ``next(left_edge)`` invokes the *builtin* next(),
        # which requires the edge to define ``__next__``.  The Python 2
        # original presumably read ``left_edge.next()`` -- the symbol after
        # the edge's dot -- and this looks like an over-eager 2to3
        # conversion; confirm against the TreeEdge API in this NLTK version
        # (later releases rename the accessor to ``nextsym()``).
        if not (left_edge.end() == right_edge.start() and
                next(left_edge) == right_edge.lhs() and
                left_edge.is_incomplete() and right_edge.is_complete()):
            return

        # Construct the new edge.  Its inside probability is the product of
        # the probabilities of the edges it combines.
        p = left_edge.prob() * right_edge.prob()
        new_edge = ProbabilisticTreeEdge(p,
                            span=(left_edge.start(), right_edge.end()),
                            lhs=left_edge.lhs(), rhs=left_edge.rhs(),
                            dot=left_edge.dot()+1)

        # Add it to the chart, with appropriate child pointers: one child
        # pointer list per existing list for the left edge, each extended
        # with the right edge.
        changed_chart = False
        for cpl1 in chart.child_pointer_lists(left_edge):
            if chart.insert(new_edge, cpl1+(right_edge,)):
                changed_chart = True

        # If we changed the chart, then generate the edge.
        if changed_chart: yield new_edge

 

class SingleEdgeProbabilisticFundamentalRule(AbstractChartRule):
    """
    A single-edge version of the fundamental rule: given one edge, find
    every chart edge it can combine with and apply
    ``ProbabilisticFundamentalRule`` to each resulting pair.
    """
    NUM_EDGES=1

    _fundamental_rule = ProbabilisticFundamentalRule()

    def apply_iter(self, chart, grammar, edge1):
        fr = self._fundamental_rule
        if edge1.is_incomplete():
            # edge1 = left_edge; edge2 = right_edge
            # NOTE(review): ``next(edge1)`` invokes the builtin next(); the
            # Python 2 original presumably read ``edge1.next()`` (the symbol
            # after the dot) -- confirm against the TreeEdge API in use.
            for edge2 in chart.select(start=edge1.end(), is_complete=True,
                                     lhs=next(edge1)):
                for new_edge in fr.apply_iter(chart, grammar, edge1, edge2):
                    yield new_edge
        else:
            # edge2 = left_edge; edge1 = right_edge
            for edge2 in chart.select(end=edge1.start(), is_complete=False,
                                     next=edge1.lhs()):
                for new_edge in fr.apply_iter(chart, grammar, edge2, edge1):
                    yield new_edge

    def __str__(self): return 'Fundamental Rule'

 

class BottomUpProbabilisticChartParser(ParserI):
    """
    An abstract bottom-up parser for ``PCFG`` grammars that uses a ``Chart`` to
    record partial results.  ``BottomUpProbabilisticChartParser`` maintains
    a queue of edges that can be added to the chart.  This queue is
    initialized with edges for each token in the text that is being
    parsed.  ``BottomUpProbabilisticChartParser`` inserts these edges into
    the chart one at a time, starting with the most likely edges, and
    proceeding to less likely edges.  For each edge that is added to
    the chart, it may become possible to insert additional edges into
    the chart; these are added to the queue.  This process continues
    until enough complete parses have been generated, or until the
    queue is empty.

    The sorting order for the queue is not specified by
    ``BottomUpProbabilisticChartParser``.  Different sorting orders will
    result in different search strategies.  The sorting order for the
    queue is defined by the method ``sort_queue``; subclasses are required
    to provide a definition for this method.

    :type _grammar: PCFG
    :ivar _grammar: The grammar used to parse sentences.
    :type _trace: int
    :ivar _trace: The level of tracing output that should be generated
        when parsing a text.
    """
    def __init__(self, grammar, beam_size=0, trace=0):
        """
        Create a new ``BottomUpProbabilisticChartParser``, that uses
        ``grammar`` to parse texts.

        :type grammar: PCFG
        :param grammar: The grammar used to parse texts.
        :type beam_size: int
        :param beam_size: The maximum length for the parser's edge queue.
        :type trace: int
        :param trace: The level of tracing that should be used when
            parsing a text.  ``0`` will generate no tracing output;
            and higher numbers will produce more verbose tracing
            output.
        :raise ValueError: If ``grammar`` is not probabilistic.
        """
        if not isinstance(grammar, WeightedGrammar):
            raise ValueError("The grammar must be probabilistic WeightedGrammar")
        self._grammar = grammar
        self.beam_size = beam_size
        self._trace = trace

    def grammar(self):
        """Return the grammar used to parse sentences."""
        return self._grammar

    def trace(self, trace=2):
        """
        Set the level of tracing output that should be generated when
        parsing a text.

        :type trace: int
        :param trace: The trace level.  A trace level of ``0`` will
            generate no tracing output; and higher trace levels will
            produce more verbose tracing output.
        :rtype: None
        """
        self._trace = trace

    # TODO: change this to conform more with the standard ChartParser
    def nbest_parse(self, tokens, n=None):
        """
        Return a list of the ``n`` most probable parses for ``tokens``
        (all parses, sorted by decreasing probability, if ``n`` is None).
        """
        self._grammar.check_coverage(tokens)
        chart = Chart(list(tokens))
        grammar = self._grammar

        # Chart parser rules.
        bu_init = ProbabilisticBottomUpInitRule()
        bu = ProbabilisticBottomUpPredictRule()
        fr = SingleEdgeProbabilisticFundamentalRule()

        # Our queue!
        queue = []

        # Initialize the chart with a leaf edge per token.
        for edge in bu_init.apply_iter(chart, grammar):
            if self._trace > 1:
                print('  %-50s [%s]' % (chart.pp_edge(edge,width=2),
                                        edge.prob()))
            queue.append(edge)

        while queue:
            # Re-sort the queue so the edge to try next is at the end.
            self.sort_queue(queue, chart)

            # Prune the queue to the correct size if a beam was defined
            if self.beam_size:
                self._prune(queue, chart)

            # Get the best edge.
            edge = queue.pop()
            if self._trace > 0:
                print('  %-50s [%s]' % (chart.pp_edge(edge,width=2),
                                        edge.prob()))

            # Apply BU & FR to it.
            queue.extend(bu.apply(chart, grammar, edge))
            queue.extend(fr.apply(chart, grammar, edge))

        # Get a list of complete parses.
        parses = chart.parses(grammar.start(), ProbabilisticTree)

        # Assign probabilities to the trees.
        prod_probs = {}
        for prod in grammar.productions():
            prod_probs[prod.lhs(), prod.rhs()] = prod.prob()
        for parse in parses:
            self._setprob(parse, prod_probs)

        # Sort by probability, most probable first.  (Key-based sorting
        # replaces the Python-2-only cmp-function form ``sort(lambda a,b:
        # cmp(b.prob(), a.prob()))``, which is a TypeError on Python 3.)
        parses.sort(key=lambda tree: tree.prob(), reverse=True)

        return parses[:n]

    def _setprob(self, tree, prod_probs):
        """
        Recursively assign to ``tree`` (and its subtrees) the probability
        of the production that generated it times the probabilities of
        its children, looked up in ``prod_probs``.
        """
        if tree.prob() is not None: return

        # Get the prob of the CFG production.
        lhs = Nonterminal(tree.node)
        rhs = []
        for child in tree:
            if isinstance(child, Tree):
                rhs.append(Nonterminal(child.node))
            else:
                rhs.append(child)
        prob = prod_probs[lhs, tuple(rhs)]

        # Get the probs of children.
        for child in tree:
            if isinstance(child, Tree):
                self._setprob(child, prod_probs)
                prob *= child.prob()

        tree.set_prob(prob)

    def sort_queue(self, queue, chart):
        """
        Sort the given queue of ``Edge`` objects, placing the edge that should
        be tried first at the beginning of the queue.  This method
        will be called after each ``Edge`` is added to the queue.

        :param queue: The queue of ``Edge`` objects to sort.  Each edge in
            this queue is an edge that could be added to the chart by
            the fundamental rule; but that has not yet been added.
        :type queue: list(Edge)
        :param chart: The chart being used to parse the text.  This
            chart can be used to provide extra information for sorting
            the queue.
        :type chart: Chart
        :rtype: None
        """
        raise NotImplementedError()

    def _prune(self, queue, chart):
        """ Discard items in the queue if the queue is longer than the beam."""
        if len(queue) > self.beam_size:
            # Drop the edges at the front of the queue (the least
            # preferred ones, after sort_queue has run).
            split = len(queue)-self.beam_size
            if self._trace > 2:
                for edge in queue[:split]:
                    print('  %-50s [DISCARDED]' % chart.pp_edge(edge,2))
            del queue[:split]

 

class InsideChartParser(BottomUpProbabilisticChartParser):
    """
    A bottom-up parser for ``PCFG`` grammars that tries edges in descending
    order of the inside probabilities of their trees.  The "inside
    probability" of a tree is simply the
    probability of the entire tree, ignoring its context.  In
    particular, the inside probability of a tree generated by
    production *p* with children *c[1], c[2], ..., c[n]* is
    *P(p)P(c[1])P(c[2])...P(c[n])*; and the inside
    probability of a token is 1 if it is present in the text, and 0 if
    it is absent.

    This sorting order results in a type of lowest-cost-first search
    strategy.
    """
    # Inherit constructor.
    def sort_queue(self, queue, chart):
        """
        Sort the given queue of edges, in descending order of the
        inside probabilities of the edges' trees.

        :param queue: The queue of ``Edge`` objects to sort.  Each edge in
            this queue is an edge that could be added to the chart by
            the fundamental rule; but that has not yet been added.
        :type queue: list(Edge)
        :param chart: The chart being used to parse the text.  This
            chart can be used to provide extra information for sorting
            the queue.
        :type chart: Chart
        :rtype: None
        """
        # Ascending sort puts the most probable edge at the *end* of the
        # list, which is where nbest_parse pops from.  (Key-based sorting
        # replaces the Python-2-only cmp-function form, which raises a
        # TypeError on Python 3.)
        queue.sort(key=lambda edge: edge.prob())

 

# Eventually, this will become some sort of inside-outside parser: 

# class InsideOutsideParser(BottomUpProbabilisticChartParser): 

#     def __init__(self, grammar, trace=0): 

#         # Inherit docs. 

#         BottomUpProbabilisticChartParser.__init__(self, grammar, trace) 

# 

#         # Find the best path from S to each nonterminal 

#         bestp = {} 

#         for production in grammar.productions(): bestp[production.lhs()]=0 

#         bestp[grammar.start()] = 1.0 

# 

#         for i in range(len(grammar.productions())): 

#             for production in grammar.productions(): 

#                 lhs = production.lhs() 

#                 for elt in production.rhs(): 

#                     bestp[elt] = max(bestp[lhs]*production.prob(), 

#                                      bestp.get(elt,0)) 

# 

#         self._bestp = bestp 

#         for (k,v) in self._bestp.items(): print k,v 

# 

#     def _cmp(self, e1, e2): 

#         return cmp(e1.structure()[PROB]*self._bestp[e1.lhs()], 

#                    e2.structure()[PROB]*self._bestp[e2.lhs()]) 

# 

#     def sort_queue(self, queue, chart): 

#         queue.sort(self._cmp) 

 

import random 

class RandomChartParser(BottomUpProbabilisticChartParser):
    """
    A bottom-up parser for ``PCFG`` grammars that tries edges in random order.
    This sorting order results in a random search strategy.
    """
    # Inherit constructor

    def sort_queue(self, queue, chart):
        # Swap a uniformly chosen edge into the last slot, which is the
        # position that nbest_parse pops next.
        pick = random.randint(0, len(queue)-1)
        queue[pick], queue[-1] = queue[-1], queue[pick]

 

class UnsortedChartParser(BottomUpProbabilisticChartParser):
    """
    A bottom-up parser for ``PCFG`` grammars that tries edges in whatever order.
    """
    # Inherit constructor

    def sort_queue(self, queue, chart):
        # Leave the queue untouched: edges are tried in whatever order
        # they were appended.
        return

 

class LongestChartParser(BottomUpProbabilisticChartParser):
    """
    A bottom-up parser for ``PCFG`` grammars that tries longer edges before
    shorter ones.  This sorting order results in a type of best-first
    search strategy.
    """
    # Inherit constructor
    def sort_queue(self, queue, chart):
        """
        Sort ``queue`` in ascending order of edge length, so the longest
        edge sits at the end of the list, where nbest_parse pops.
        """
        # Key-based sorting replaces the Python-2-only cmp-function form
        # ``sort(lambda e1,e2: cmp(e1.length(), e2.length()))``, which
        # raises a TypeError on Python 3.
        queue.sort(key=lambda edge: edge.length())

 

##////////////////////////////////////////////////////// 

##  Test Code 

##////////////////////////////////////////////////////// 

 

def demo(choice=None, draw_parses=None, print_parses=None):
    """
    A demonstration of the probabilistic parsers.  The user is
    prompted to select which demo to run, and how many parses should
    be found; and then each parser is run on the same demo, and a
    summary of the results are displayed.

    :param choice: 0-based index of the demo sentence/grammar to use;
        if None, the user is prompted on stdin.
    :param draw_parses: Whether to display the parse trees graphically;
        if None, the user is prompted.
    :param print_parses: Whether to print the parse trees;
        if None, the user is prompted.
    """
    import sys, time
    from nltk import tokenize, toy_pcfg1, toy_pcfg2
    from nltk.parse import pchart

    # Define two demos.  Each demo has a sentence and a grammar.
    demos = [('I saw John with my telescope', toy_pcfg1),
             ('the boy saw Jack with Bob under the table with a telescope',
              toy_pcfg2)]

    if choice is None:
        # Ask the user which demo they want to use.
        print()
        for i in range(len(demos)):
            print('%3s: %s' % (i+1, demos[i][0]))
            print('     %r' % demos[i][1])
            print()
        print('Which demo (%d-%d)? ' % (1, len(demos)), end=' ')
        choice = int(sys.stdin.readline().strip())-1
    try:
        sent, grammar = demos[choice]
    except (IndexError, TypeError):
        # Narrowed from a bare ``except``: catch only a bad demo index
        # (out of range, or a non-integer ``choice`` argument).
        print('Bad sentence number')
        return

    # Tokenize the sentence.
    tokens = sent.split()

    # Define a list of parsers.  We'll use all parsers.
    parsers = [
        pchart.InsideChartParser(grammar),
        pchart.RandomChartParser(grammar),
        pchart.UnsortedChartParser(grammar),
        pchart.LongestChartParser(grammar),
        pchart.InsideChartParser(grammar, beam_size = len(tokens)+1)   # was BeamParser
        ]

    # Run the parsers on the tokenized sentence.
    times = []
    average_p = []
    num_parses = []
    all_parses = {}
    for parser in parsers:
        print('\ns: %s\nparser: %s\ngrammar: %s' % (sent,parser,grammar))
        parser.trace(3)
        t = time.time()
        parses = parser.nbest_parse(tokens)
        times.append(time.time()-t)
        # Average probability over this parser's parses (0 if none found).
        if parses: p = reduce(lambda a,b:a+b.prob(), parses, 0)/len(parses)
        else: p = 0
        average_p.append(p)
        num_parses.append(len(parses))
        for p in parses: all_parses[p.freeze()] = 1

    # Print some summary statistics
    print()
    print('       Parser      Beam | Time (secs)   # Parses   Average P(parse)')
    print('------------------------+------------------------------------------')
    for i in range(len(parsers)):
        print('%18s %4d |%11.4f%11d%19.14f' % (parsers[i].__class__.__name__,
                                             parsers[i].beam_size,
                                             times[i],num_parses[i],average_p[i]))
    # Materialize the key view so the collection behaves the same on
    # Python 3 (where dict.keys() returns a view) as on Python 2.
    parses = list(all_parses.keys())
    if parses: p = reduce(lambda a,b:a+b.prob(), parses, 0)/len(parses)
    else: p = 0
    print('------------------------+------------------------------------------')
    print('%18s      |%11s%11d%19.14f' % ('(All Parses)', 'n/a', len(parses), p))

    if draw_parses is None:
        # Ask the user if we should draw the parses.
        print()
        print('Draw parses (y/n)? ', end=' ')
        draw_parses = sys.stdin.readline().strip().lower().startswith('y')
    if draw_parses:
        from nltk.draw.tree import draw_trees
        print('  please wait...')
        draw_trees(*parses)

    if print_parses is None:
        # Ask the user if we should print the parses.
        print()
        print('Print parses (y/n)? ', end=' ')
        print_parses = sys.stdin.readline().strip().lower().startswith('y')
    if print_parses:
        for parse in parses:
            print(parse)

 

if __name__ == '__main__':
    # Run the interactive demonstration when executed as a script.
    demo()