# Natural Language Toolkit: Corpus Reader Utilities 

# 

# Copyright (C) 2001-2012 NLTK Project 

# Author: Steven Bird <sb@ldc.upenn.edu> 

#         Edward Loper <edloper@gradient.cis.upenn.edu> 

# URL: <http://www.nltk.org/> 

# For license information, see LICENSE.TXT 

 

import os 

import bisect 

import re 

import tempfile 

from functools import reduce 

try: 

    import cPickle as pickle 

except ImportError: 

    import pickle 

 

# Use the c version of ElementTree, which is faster, if possible: 

try: from xml.etree import cElementTree as ElementTree 

except ImportError: from xml.etree import ElementTree 

 

from nltk import compat 

from nltk.tokenize import wordpunct_tokenize 

from nltk.internals import slice_bounds 

from nltk.data import PathPointer, FileSystemPathPointer, ZipFilePathPointer 

from nltk.data import SeekableUnicodeStreamReader 

from nltk.sourcedstring import SourcedStringStream 

from nltk.util import AbstractLazySequence, LazySubsequence, LazyConcatenation 

 

###################################################################### 

#{ Corpus View 

###################################################################### 

 

class StreamBackedCorpusView(AbstractLazySequence): 

    """ 

    A 'view' of a corpus file, which acts like a sequence of tokens: 

    it can be accessed by index, iterated over, etc.  However, the 

    tokens are only constructed as-needed -- the entire corpus is 

    never stored in memory at once. 

 

    The constructor to ``StreamBackedCorpusView`` takes two arguments: 

    a corpus fileid (specified as a string or as a ``PathPointer``); 

    and a block reader.  A "block reader" is a function that reads 

    zero or more tokens from a stream, and returns them as a list.  A 

    very simple example of a block reader is: 

 

        >>> def simple_block_reader(stream): 

        ...     return stream.readline().split() 

 

    This simple block reader reads a single line at a time, and 

    returns a single token (consisting of a string) for each 

    whitespace-separated substring on the line. 
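
    For example, a corpus view built from this block reader acts like a
    list of the file's whitespace-separated tokens (``'corpus.txt'`` is
    a hypothetical filename, so the doctest is skipped):

    .. doctest::
        :options: +SKIP

        >>> view = StreamBackedCorpusView('corpus.txt', simple_block_reader)
        >>> view[5]    # reads only as many blocks as needed
        'sixth-token'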

 

    When deciding how to define the block reader for a given 

    corpus, careful consideration should be given to the size of 

    blocks handled by the block reader.  Smaller block sizes will 

    increase the memory requirements of the corpus view's internal 

    data structures (by 2 integers per block).  On the other hand, 

    larger block sizes may decrease performance for random access to 

    the corpus.  (But note that larger block sizes will *not* 

    decrease performance for iteration.) 

 

    Internally, ``CorpusView`` maintains a partial mapping from token 

    index to file position, with one entry per block.  When a token 

    with a given index *i* is requested, the ``CorpusView`` constructs 

    it as follows: 

 

      1. First, it searches the toknum/filepos mapping for the token 

         index closest to (but less than or equal to) *i*. 

 

      2. Then, starting at the file position corresponding to that 

         index, it reads one block at a time using the block reader 

         until it reaches the requested token. 

 

    The toknum/filepos mapping is created lazily: it is initially 

    empty, but every time a new block is read, the block's 

    initial token is added to the mapping.  (Thus, the toknum/filepos 

    map has one entry per block.) 
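
    For instance, if the block reader returns 10 tokens per block, then
    after three blocks have been read the mapping might look like
    ``_toknum = [0, 10, 20, 30]`` and ``_filepos = [0, 57, 121, 180]``
    (the file positions depend on the file's contents).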

 

    In order to increase efficiency for random access patterns that 

    have high degrees of locality, the corpus view may cache one or 

    more blocks. 

 

    :note: Each ``CorpusView`` object internally maintains an open file 

        object for its underlying corpus file.  This file should be 

        automatically closed when the ``CorpusView`` is garbage collected, 

        but if you wish to close it manually, use the ``close()`` 

        method.  If you access a ``CorpusView``'s items after it has been 

        closed, the file object will be automatically re-opened. 

 

    :warning: If the contents of the file are modified during the 

        lifetime of the ``CorpusView``, then the ``CorpusView``'s behavior 

        is undefined. 

 

    :warning: If a unicode encoding is specified when constructing a 

        ``CorpusView``, then the block reader may only call 

        ``stream.seek()`` with offsets that have been returned by 

        ``stream.tell()``; in particular, calling ``stream.seek()`` with 

        relative offsets, or with offsets based on string lengths, may 

        lead to incorrect behavior. 

 

    :ivar _block_reader: The function used to read 

        a single block from the underlying file stream. 

    :ivar _toknum: A list containing the token index of each block 

        that has been processed.  In particular, ``_toknum[i]`` is the 

        token index of the first token in block ``i``.  Together 

        with ``_filepos``, this forms a partial mapping between token 

        indices and file positions. 

    :ivar _filepos: A list containing the file position of each block 

        that has been processed.  In particular, ``_filepos[i]`` is the

        file position of the first character in block ``i``.  Together 

        with ``_toknum``, this forms a partial mapping between token 

        indices and file positions. 

    :ivar _stream: The stream used to access the underlying corpus file. 

    :ivar _len: The total number of tokens in the corpus, if known; 

        or None, if the number of tokens is not yet known. 

    :ivar _eofpos: The character position of the end of the

        file.  This is calculated when the corpus view is initialized, 

        and is used to decide when the end of file has been reached. 

    :ivar _cache: A cache of the most recently read block.  It 

       is encoded as a tuple (start_toknum, end_toknum, tokens), where 

       start_toknum is the token index of the first token in the block; 

       end_toknum is the token index of the first token not in the 

       block; and tokens is a list of the tokens in the block. 

    """ 

    def __init__(self, fileid, block_reader=None, startpos=0, 

                 encoding=None, source=None): 

        """ 

        Create a new corpus view, based on the file ``fileid``, and 

        read with ``block_reader``.  See the class documentation 

        for more information. 

 

        :param fileid: The path to the file that is read by this 

            corpus view.  ``fileid`` can either be a string or a 

            ``PathPointer``. 

 

        :param startpos: The file position at which the view will 

            start reading.  This can be used to skip over preface 

            sections. 

 

        :param encoding: The unicode encoding that should be used to 

            read the file's contents.  If no encoding is specified, 

            then the file's contents will be read as a non-unicode 

            string (i.e., a str). 

 

        :param source: If specified, then use a ``SourcedStringStream``

            to annotate all strings read from the file with 

            information about their start offset, end offset,

            and docid.  The value of ``source`` will be used as the docid. 

        """ 

        if block_reader: 

            self.read_block = block_reader 

        # Initialize our toknum/filepos mapping. 

        self._toknum = [0] 

        self._filepos = [startpos] 

        self._encoding = encoding 

        self._source = source 

        # We don't know our length (number of tokens) yet. 

        self._len = None 

 

        self._fileid = fileid 

        self._stream = None 

 

        self._current_toknum = None 

        """This variable is set to the index of the next token that 

           will be read, immediately before ``self.read_block()`` is 

           called.  This is provided for the benefit of the block 

           reader, which under rare circumstances may need to know 

           the current token number.""" 

 

        self._current_blocknum = None 

        """This variable is set to the index of the next block that 

           will be read, immediately before ``self.read_block()`` is 

           called.  This is provided for the benefit of the block 

           reader, which under rare circumstances may need to know 

           the current block number.""" 

 

        # Find the length of the file. 

        try: 

            if isinstance(self._fileid, PathPointer): 

                self._eofpos = self._fileid.file_size() 

            else: 

                self._eofpos = os.stat(self._fileid).st_size 

        except Exception as exc: 

            raise ValueError('Unable to open or access %r -- %s' % 

                             (fileid, exc)) 

 

        # Maintain a cache of the most recently read block, to 

        # increase efficiency of random access. 
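        # The cache is encoded as (start_toknum, end_toknum, token_list);
        # (-1, -1, None) means that the cache is empty.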

        self._cache = (-1, -1, None) 

 

    fileid = property(lambda self: self._fileid, doc=""" 

        The fileid of the file that is accessed by this view. 

 

        :type: str or PathPointer""") 

 

    def read_block(self, stream): 

        """ 

        Read a block from the input stream. 

 

        :return: a block of tokens from the input stream 

        :rtype: list(any) 

        :param stream: an input stream 

        :type stream: stream 

        """ 

        raise NotImplementedError('Abstract Method') 

 

    def _open(self): 

        """ 

        Open the file stream associated with this corpus view.  This 

        will be performed automatically if any value is read from the view

        while its file stream is closed. 

        """ 

        if isinstance(self._fileid, PathPointer): 

            self._stream = self._fileid.open(self._encoding) 

        elif self._encoding: 

            self._stream = SeekableUnicodeStreamReader( 

                open(self._fileid, 'rb'), self._encoding) 

        else: 

            self._stream = open(self._fileid, 'rb') 

        if self._source is not None: 

            self._stream = SourcedStringStream(self._stream, self._source) 

 

    def close(self): 

        """ 

        Close the file stream associated with this corpus view.  This 

        can be useful if you are worried about running out of file 

        handles (although the stream should automatically be closed 

        upon garbage collection of the corpus view).  If the corpus 

        view is accessed after it is closed, it will be automatically 

        re-opened. 

        """ 

        if self._stream is not None: 

            self._stream.close() 

        self._stream = None 

 

    def __len__(self): 

        if self._len is None: 

            # iterate_from() sets self._len when it reaches the end 

            # of the file: 

            for tok in self.iterate_from(self._toknum[-1]): pass 

        return self._len 

 

    def __getitem__(self, i): 

        if isinstance(i, slice): 

            start, stop = slice_bounds(self, i) 

            # Check if it's in the cache. 

            offset = self._cache[0] 

            if offset <= start and stop <= self._cache[1]: 

                return self._cache[2][start-offset:stop-offset] 

            # Construct & return the result. 

            return LazySubsequence(self, start, stop) 

        else: 

            # Handle negative indices 

            if i < 0: i += len(self) 

            if i < 0: raise IndexError('index out of range') 

            # Check if it's in the cache. 

            offset = self._cache[0] 

            if offset <= i < self._cache[1]: 

                return self._cache[2][i-offset] 

            # Use iterate_from to extract it. 

            try: 

                return next(self.iterate_from(i)) 

            except StopIteration: 

                raise IndexError('index out of range') 

 

    # If we wanted to be thread-safe, then this method would need to 

    # do some locking. 

    def iterate_from(self, start_tok): 

        # Start by feeding from the cache, if possible. 

        if self._cache[0] <= start_tok < self._cache[1]: 

            for tok in self._cache[2][start_tok-self._cache[0]:]: 

                yield tok 

                start_tok += 1 

 

        # Decide where in the file we should start.  If `start` is in 

        # our mapping, then we can jump straight to the correct block; 

        # otherwise, start at the last block we've processed. 

        if start_tok < self._toknum[-1]: 

            block_index = bisect.bisect_right(self._toknum, start_tok)-1 
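            # (bisect_right returns the insertion point for start_tok,
            # so subtracting one selects the last block whose first
            # token index is <= start_tok.)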

            toknum = self._toknum[block_index] 

            filepos = self._filepos[block_index] 

        else: 

            block_index = len(self._toknum)-1 

            toknum = self._toknum[-1] 

            filepos = self._filepos[-1] 

 

        # Open the stream, if it's not open already. 

        if self._stream is None: 

            self._open() 

 

        # Each iteration through this loop, we read a single block 

        # from the stream. 

        while filepos < self._eofpos: 

            # Read the next block. 

            self._stream.seek(filepos) 

            self._current_toknum = toknum 

            self._current_blocknum = block_index 

            tokens = self.read_block(self._stream) 

            assert isinstance(tokens, (tuple, list, AbstractLazySequence)), ( 

                'block reader %s() should return a list, tuple, or AbstractLazySequence.' %

                self.read_block.__name__) 

            num_toks = len(tokens) 

            new_filepos = self._stream.tell() 

            assert new_filepos > filepos, ( 

                'block reader %s() should consume at least 1 byte (filepos=%d)' % 

                (self.read_block.__name__, filepos)) 

 

            # Update our cache. 

            self._cache = (toknum, toknum+num_toks, list(tokens)) 

 

            # Update our mapping. 

            assert toknum <= self._toknum[-1] 

            if num_toks > 0: 

                block_index += 1 

                if toknum == self._toknum[-1]: 

                    assert new_filepos > self._filepos[-1] # monotonic! 

                    self._filepos.append(new_filepos) 

                    self._toknum.append(toknum+num_toks) 

                else: 

                    # Check for consistency: 

                    assert new_filepos == self._filepos[block_index], ( 

                        'inconsistent block reader (num chars read)') 

                    assert toknum+num_toks == self._toknum[block_index], ( 

                        'inconsistent block reader (num tokens returned)') 

 

            # If we reached the end of the file, then update self._len 

            if new_filepos == self._eofpos: 

                self._len = toknum + num_toks 

            # Generate the tokens in this block (but skip any tokens 

            # before start_tok).  Note that between yields, our state 

            # may be modified. 

            for tok in tokens[max(0, start_tok-toknum):]: 

                yield tok 

            # If we're at the end of the file, then we're done. 

            assert new_filepos <= self._eofpos 

            if new_filepos == self._eofpos: 

                break 

            # Update our indices 

            toknum += num_toks 

            filepos = new_filepos 

 

        # If we reach this point, then we should know our length. 

        assert self._len is not None 

 

    # Use concat for these, so we can use a ConcatenatedCorpusView 

    # when possible. 

    def __add__(self, other): 

        return concat([self, other]) 

    def __radd__(self, other): 

        return concat([other, self]) 

    def __mul__(self, count): 

        return concat([self] * count) 

    def __rmul__(self, count): 

        return concat([self] * count) 

 

class ConcatenatedCorpusView(AbstractLazySequence): 

    """ 

    A 'view' of a corpus file that joins together one or more 

    ``StreamBackedCorpusView`` objects.  At most

    one file handle is left open at any time. 

    """ 

    def __init__(self, corpus_views): 

        self._pieces = corpus_views 

        """A list of the corpus subviews that make up this 

        concatenation.""" 

 

        self._offsets = [0] 

        """A list of offsets, indicating the index at which each 

        subview begins.  In particular:: 

            offsets[i] = sum([len(p) for p in pieces[:i]])""" 

 

        self._open_piece = None 

        """The most recently accessed corpus subview (or None). 

        Before a new subview is accessed, this subview will be closed.""" 

 

    def __len__(self): 

        if len(self._offsets) <= len(self._pieces): 

            # Iterate to the end of the corpus. 

            for tok in self.iterate_from(self._offsets[-1]): pass 

 

        return self._offsets[-1] 

 

    def close(self): 

        for piece in self._pieces: 

            piece.close() 

 

    def iterate_from(self, start_tok): 

        piecenum = bisect.bisect_right(self._offsets, start_tok)-1 

 

        while piecenum < len(self._pieces): 

            offset = self._offsets[piecenum] 

            piece = self._pieces[piecenum] 

 

            # If we've got another piece open, close it first. 

            if self._open_piece is not piece: 

                if self._open_piece is not None: 

                    self._open_piece.close() 

                self._open_piece = piece 

 

            # Get everything we can from this piece. 

            for tok in piece.iterate_from(max(0, start_tok-offset)): 

                yield tok 

 

            # Update the offset table. 

            if piecenum+1 == len(self._offsets): 

                self._offsets.append(self._offsets[-1] + len(piece)) 

 

            # Move on to the next piece. 

            piecenum += 1 

 

def concat(docs): 

    """ 

    Concatenate together the contents of multiple documents from a 

    single corpus, using an appropriate concatenation function.  This 

    utility function is used by corpus readers when the user requests 

    more than one document at a time. 
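
    For example, concatenating two plain strings falls through to
    ordinary string concatenation:

        >>> concat(['ab', 'cd'])
        'abcd'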

    """ 

    if len(docs) == 1: 

        return docs[0] 

    if len(docs) == 0: 

        raise ValueError('concat() expects at least one object!') 

 

    types = set([d.__class__ for d in docs]) 

 

    # If they're all strings, use string concatenation. 

    if all(isinstance(doc, compat.string_types) for doc in docs): 

        return ''.join(docs) 

 

    # If they're all corpus views, then use ConcatenatedCorpusView. 

    for typ in types: 

        if not issubclass(typ, (StreamBackedCorpusView, 

                                ConcatenatedCorpusView)): 

            break 

    else: 

        return ConcatenatedCorpusView(docs) 

 

    # If they're all lazy sequences, use a lazy concatenation 

    for typ in types: 

        if not issubclass(typ, AbstractLazySequence): 

            break 

    else: 

        return LazyConcatenation(docs) 

 

    # Otherwise, see what we can do: 

    if len(types) == 1: 

        typ = list(types)[0] 

 

        if issubclass(typ, list): 

            return reduce((lambda a,b:a+b), docs, []) 

 

        if issubclass(typ, tuple): 

            return reduce((lambda a,b:a+b), docs, ()) 

 

        # iselement() tests an instance, not a class, so check one of
        # the documents directly:
        if ElementTree.iselement(docs[0]):

            xmltree = ElementTree.Element('documents') 

            for doc in docs: xmltree.append(doc) 

            return xmltree 

 

    # No method found! 

    raise ValueError("Don't know how to concatenate types: %r" % types) 

 

###################################################################### 

#{ Corpus View for Pickled Sequences 

###################################################################### 

 

class PickleCorpusView(StreamBackedCorpusView): 

    """ 

    A stream backed corpus view for corpus files that consist of 

    sequences of serialized Python objects (serialized using 

    ``pickle.dump``).  One use case for this class is to store the 

    result of running feature detection on a corpus to disk.  This can 

    be useful when performing feature detection is expensive (so we 

    don't want to repeat it); but the corpus is too large to store in 

    memory.  The following example illustrates this technique: 

 

    .. doctest:: 

        :options: +SKIP 

 

        >>> from nltk.corpus.reader.util import PickleCorpusView 

        >>> from nltk.util import LazyMap 

        >>> feature_corpus = LazyMap(detect_features, corpus) 

        >>> PickleCorpusView.write(feature_corpus, some_fileid) 

        >>> pcv = PickleCorpusView(some_fileid) 

    """ 

    BLOCK_SIZE = 100 

    PROTOCOL = -1 

 

    def __init__(self, fileid, delete_on_gc=False): 

        """ 

        Create a new corpus view that reads the pickle corpus 

        ``fileid``. 

 

        :param delete_on_gc: If true, then ``fileid`` will be deleted 

            whenever this object gets garbage-collected. 

        """ 

        self._delete_on_gc = delete_on_gc 

        StreamBackedCorpusView.__init__(self, fileid) 

 

    def read_block(self, stream): 

        result = [] 

        for i in range(self.BLOCK_SIZE): 

            try: result.append(pickle.load(stream)) 

            except EOFError: break 

        return result 

 

    def __del__(self): 

        """ 

        If ``delete_on_gc`` was set to true when this 

        ``PickleCorpusView`` was created, then delete the corpus view's 

        fileid.  (This method is called whenever a 

        ``PickleCorpusView`` is garbage-collected.)

        """ 

        # (Default to False in case __init__ never completed.)
        if getattr(self, '_delete_on_gc', False):

            if os.path.exists(self._fileid): 

                try: os.remove(self._fileid) 

                except (OSError, IOError): pass 

        self.__dict__.clear() # make the garbage collector's job easier 

 

    @classmethod 

    def write(cls, sequence, output_file): 

        if isinstance(output_file, compat.string_types): 

            output_file = open(output_file, 'wb') 

        for item in sequence: 

            pickle.dump(item, output_file, cls.PROTOCOL) 

 

    @classmethod 

    def cache_to_tempfile(cls, sequence, delete_on_gc=True): 

        """ 

        Write the given sequence to a temporary file as a pickle 

        corpus, and then return a ``PickleCorpusView`` for that

        temporary corpus file. 

 

        :param delete_on_gc: If true, then the temporary file will be 

            deleted whenever this object gets garbage-collected. 

        """ 

        try: 

            fd, output_file_name = tempfile.mkstemp('.pcv', 'nltk-') 

            output_file = os.fdopen(fd, 'wb') 

            cls.write(sequence, output_file) 

            output_file.close() 

            return PickleCorpusView(output_file_name, delete_on_gc) 

        except (OSError, IOError) as e: 

            raise ValueError('Error while creating temp file: %s' % e) 

 

 

 

###################################################################### 

#{ Block Readers 

###################################################################### 
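
# Each function below is a ready-made block reader for
# StreamBackedCorpusView: it takes an open stream, reads zero or more
# tokens from it, and returns them as a list.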

 

def read_whitespace_block(stream): 

    toks = [] 

    for i in range(20): # Read 20 lines at a time. 

        toks.extend(stream.readline().split()) 

    return toks 

 

def read_wordpunct_block(stream): 

    toks = [] 

    for i in range(20): # Read 20 lines at a time. 

        toks.extend(wordpunct_tokenize(stream.readline())) 

    return toks 

 

def read_line_block(stream): 

    toks = [] 

    for i in range(20): 

        line = stream.readline() 

        if not line: return toks 

        toks.append(line.rstrip('\n')) 

    return toks 

 

def read_blankline_block(stream): 

    s = '' 

    while True: 

        line = stream.readline() 

        # End of file: 

        if not line: 

            if s: return [s] 

            else: return [] 

        # Blank line: 

        elif line and not line.strip(): 

            if s: return [s] 

        # Other line: 

        else: 

            s += line 

 

def read_alignedsent_block(stream): 

    s = '' 

    while True: 

        line = stream.readline() 

        # End of file:

        if not line:

            if s: return [s]

            else: return []

        # Skip divider lines and blank lines:

        if line[0] == '=' or line[0] == '\n' or line[:2] == '\r\n':

            continue

        # Other line:

        s += line

        if re.match(r'^\d+-\d+', line) is not None:

            return [s]

 

def read_regexp_block(stream, start_re, end_re=None): 

    """ 

    Read a sequence of tokens from a stream, where tokens begin with 

    lines that match ``start_re``.  If ``end_re`` is specified, then 

    tokens end with lines that match ``end_re``; otherwise, tokens end 

    whenever the next line matching ``start_re`` or EOF is found. 
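
    For example, with ``start_re=r'Chapter \d+'`` each returned token
    would contain one chapter (hypothetical stream contents, so the
    doctest is skipped):

    .. doctest::
        :options: +SKIP

        >>> read_regexp_block(stream, start_re=r'Chapter \d+')
        ['Chapter 1\nIt was a dark and stormy night.\n']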

    """ 

    # Scan until we find a line matching the start regexp. 

    while True: 

        line = stream.readline() 

        if not line: return [] # end of file. 

        if re.match(start_re, line): break 

 

    # Scan until we find another line matching the regexp, or EOF. 

    lines = [line] 

    while True: 

        oldpos = stream.tell() 

        line = stream.readline() 

        # End of file: 

        if not line: 

            return [''.join(lines)] 

        # End of token: 

        if end_re is not None and re.match(end_re, line): 

            return [''.join(lines)] 

        # Start of new token: backup to just before it starts, and 

        # return the token we've already collected. 

        if end_re is None and re.match(start_re, line): 

            stream.seek(oldpos) 

            return [''.join(lines)] 

        # Anything else is part of the token. 

        lines.append(line) 

 

def read_sexpr_block(stream, block_size=16384, comment_char=None): 

    """ 

    Read a sequence of s-expressions from the stream, and leave the 

    stream's file position at the end of the last complete s-expression

    read.  This function will always return at least one s-expression, 

    unless there are no more s-expressions in the file. 

 

    If the file ends in the middle of an s-expression, then that

    incomplete s-expression is returned when the end of the file is 

    reached. 

 

    :param block_size: The default block size for reading.  If an 

        s-expression is longer than one block, then more than one 

        block will be read. 

    :param comment_char: A character that marks comments.  Any lines 

        that begin with this character will be stripped out. 

        (If spaces or tabs precede the comment character, then the 

        line will not be stripped.) 
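
    For example (``io.StringIO`` stands in for a real corpus stream;
    the doctest is skipped since this module also targets Python 2):

    .. doctest::
        :options: +SKIP

        >>> from io import StringIO
        >>> read_sexpr_block(StringIO('(a (b c)) (d e)'), block_size=10)
        ['(a (b c))']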

    """ 

    start = stream.tell() 

    block = stream.read(block_size) 

    encoding = getattr(stream, 'encoding', None) 

    assert encoding is not None or isinstance(block, str) 

    if encoding not in (None, 'utf-8'): 

        import warnings 

        warnings.warn('Parsing may fail, depending on the properties ' 

                      'of the %s encoding!' % encoding) 

        # (e.g., the utf-16 encoding does not work because it insists 

        # on adding BOMs to the beginning of encoded strings.) 

 

    if comment_char: 

        COMMENT = re.compile('(?m)^%s.*$' % re.escape(comment_char)) 

    while True: 

        try: 

            # If we're stripping comments, then make sure our block ends 

            # on a line boundary; and then replace any comments with 

            # space characters.  (We can't just strip them out -- that 

            # would make our offset wrong.) 

            if comment_char: 

                block += stream.readline() 

                block = re.sub(COMMENT, _sub_space, block) 

            # Read the block. 

            tokens, offset = _parse_sexpr_block(block) 

            # Skip whitespace 

            offset = re.compile(r'\s*').search(block, offset).end() 

 

            # Move to the end position. 

            if encoding is None: 

                stream.seek(start+offset) 

            else: 

                stream.seek(start+len(block[:offset].encode(encoding))) 

 

            # Return the list of tokens we processed 

            return tokens 

        except ValueError as e: 

            if e.args[0] == 'Block too small': 

                next_block = stream.read(block_size) 

                if next_block: 

                    block += next_block 

                    continue 

                else: 

                    # The file ended mid-sexpr -- return what we got. 

                    return [block.strip()] 

            else: raise 

 

def _sub_space(m): 

    """Helper function: given a regexp match, return a string of 

    spaces that's the same length as the matched string.""" 

    return ' '*(m.end()-m.start()) 

 

def _parse_sexpr_block(block): 
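    # Divide `block` into complete s-expressions, returning (tokens,
    # end), where `end` is the offset of the first character after the
    # last complete s-expression.  Raises ValueError('Block too small')
    # if the block ends in the middle of its first s-expression.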

    tokens = [] 

    start = end = 0 

 

    while end < len(block): 

        m = re.compile(r'\S').search(block, end) 

        if not m: 

            return tokens, end 

 

        start = m.start() 

 

        # Case 1: sexpr is not parenthesized. 

        if m.group() != '(': 

            m2 = re.compile(r'[\s(]').search(block, start) 

            if m2: 

                end = m2.start() 

            else: 

                if tokens: return tokens, end 

                raise ValueError('Block too small') 

 

        # Case 2: parenthesized sexpr. 

        else: 

            nesting = 0 

            for m in re.compile(r'[()]').finditer(block, start): 

                if m.group()=='(': nesting += 1 

                else: nesting -= 1 

                if nesting == 0: 

                    end = m.end() 

                    break 

            else: 

                if tokens: return tokens, end 

                raise ValueError('Block too small') 

 

        tokens.append(block[start:end]) 

 

    return tokens, end 

 

 

###################################################################### 

#{ Finding Corpus Items 

###################################################################### 

 

def find_corpus_fileids(root, regexp): 

    if not isinstance(root, PathPointer): 

        raise TypeError('find_corpus_fileids: expected a PathPointer') 

    regexp += '$' 
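    # (re.match anchors the pattern at the start; appending '$' anchors
    # it at the end, so the regexp must match the entire fileid.)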

 

    # Find fileids in a zipfile: scan the zipfile's namelist.  Filter 

    # out entries that end in '/' -- they're directories. 

    if isinstance(root, ZipFilePathPointer): 

        fileids = [name[len(root.entry):] for name in root.zipfile.namelist() 

                 if not name.endswith('/')] 

        items = [name for name in fileids if re.match(regexp, name)] 

        return sorted(items) 

 

    # Find fileids in a directory: use os.walk to search all 

    # subdirectories, and match paths against the regexp. 

    elif isinstance(root, FileSystemPathPointer): 

        items = [] 

        for dirname, subdirs, fileids in os.walk(root.path): 

            prefix = ''.join('%s/' % p for p in _path_from(root.path, dirname)) 

            items += [prefix+fileid for fileid in fileids 

                      if re.match(regexp, prefix+fileid)] 

            # Don't visit svn directories: 

            if '.svn' in subdirs: subdirs.remove('.svn') 

        return sorted(items) 

 

    else: 

        raise AssertionError("Don't know how to handle %r" % root) 

 

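# Helper: return the list of path components leading from `parent` down
# to `child`.  For example (hypothetical paths):
#     _path_from('/corpora', '/corpora/abc/def')  =>  ['abc', 'def']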
def _path_from(parent, child): 

    if os.path.split(parent)[1] == '': 

        parent = os.path.split(parent)[0] 

    path = [] 

    while parent != child: 

        child, dirname = os.path.split(child) 

        path.insert(0, dirname) 

        assert os.path.split(child)[0] != child 

    return path 

 

###################################################################### 

#{ Paragraph structure in Treebank files 

###################################################################### 

 

def tagged_treebank_para_block_reader(stream): 
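    # (Paragraphs in these files are separated by divider lines of '='
    # characters, e.g. '===================================='.)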

    # Read the next paragraph. 

    para = '' 

    while True: 

        line = stream.readline() 

        # End of paragraph: 

        if re.match(r'======+\s*$', line):

            if para.strip(): return [para] 

        # End of file: 

        elif line == '': 

            if para.strip(): return [para] 

            else: return [] 

        # Content line: 

        else: 

            para += line