Hot-keys on this page

r m x p   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

1

2

3

4

5

6

7

8

9

10

11

12

13

14

15

16

17

18

19

20

21

22

23

24

25

26

27

28

29

30

31

32

33

34

35

36

37

38

39

40

41

42

43

44

45

46

47

48

49

50

51

52

53

54

55

56

57

58

59

60

61

62

63

64

65

66

67

68

69

70

71

72

73

74

75

76

77

78

79

80

81

82

83

84

85

86

87

88

89

90

91

92

93

94

95

96

97

98

99

100

101

102

103

104

105

106

107

108

109

110

111

112

113

114

115

116

117

118

119

120

121

122

123

124

125

126

127

128

129

130

131

132

133

134

135

136

137

138

139

140

141

142

143

144

145

146

147

148

149

150

151

152

153

154

155

156

157

158

159

160

161

162

163

164

165

166

167

168

169

170

171

172

173

174

175

176

177

178

179

180

181

182

183

184

185

186

187

188

189

190

191

192

193

194

195

196

197

198

199

200

201

202

203

204

205

206

207

208

209

210

211

212

213

214

215

216

217

218

219

220

221

222

223

224

225

226

227

228

229

230

231

232

233

234

235

236

237

238

239

240

241

242

243

244

245

246

247

248

249

250

251

252

253

254

255

256

257

258

259

260

261

262

263

264

265

266

267

268

269

270

271

272

273

274

275

276

277

278

279

280

281

282

283

284

285

286

287

288

289

290

291

292

293

294

295

296

297

298

299

300

301

302

303

304

305

306

307

308

309

310

311

312

313

314

315

316

317

318

319

320

321

322

323

324

325

326

327

328

329

330

331

332

333

334

335

336

337

338

339

340

341

342

343

344

345

346

347

348

349

350

351

352

353

354

355

356

357

358

359

360

361

362

363

364

365

366

367

368

369

370

371

372

373

374

375

376

377

378

379

380

381

382

# Natural Language Toolkit: XML Corpus Reader 

# 

# Copyright (C) 2001-2012 NLTK Project 

# Author: Steven Bird <sb@csse.unimelb.edu.au> 

# URL: <http://www.nltk.org/> 

# For license information, see LICENSE.TXT 

 

""" 

Corpus reader for corpora whose documents are xml files. 

 

(note -- not named 'xml' to avoid conflicting w/ standard xml package) 

""" 

from __future__ import print_function 

 

import codecs 

 

# Use the c version of ElementTree, which is faster, if possible: 

try: from xml.etree import cElementTree as ElementTree 

except ImportError: from xml.etree import ElementTree 

 

from nltk import compat 

from nltk.data import SeekableUnicodeStreamReader 

from nltk.tokenize import WordPunctTokenizer 

from nltk.internals import ElementWrapper 

 

from nltk.corpus.reader.api import CorpusReader 

from nltk.corpus.reader.util import * 

 

class XMLCorpusReader(CorpusReader):
    """
    Corpus reader for corpora whose documents are xml files.

    Note that the ``XMLCorpusReader`` constructor does not take an
    ``encoding`` argument, because the unicode encoding is specified by
    the XML files themselves.  See the XML specs for more info.
    """
    def __init__(self, root, fileids, wrap_etree=False):
        """
        :param root: Forwarded unchanged to ``CorpusReader.__init__``.
        :param fileids: Forwarded unchanged to ``CorpusReader.__init__``.
        :param wrap_etree: If true, ``xml()`` wraps the parsed root
            element in an ``ElementWrapper`` before returning it.
        """
        self._wrap_etree = wrap_etree
        CorpusReader.__init__(self, root, fileids)

    def xml(self, fileid=None):
        """
        Parse a single file and return its root element.

        :param fileid: The file to parse.  May be omitted only when the
            corpus consists of exactly one file.
        :return: The root element of the parsed document, wrapped in an
            ``ElementWrapper`` if the reader was created with
            ``wrap_etree=True``.
        :raise TypeError: If ``fileid`` is not a single string (and
            could not be defaulted because the corpus does not consist
            of exactly one file).
        """
        # Make sure we have exactly one file -- no concatenating XML.
        if fileid is None and len(self._fileids) == 1:
            fileid = self._fileids[0]
        if not isinstance(fileid, compat.string_types):
            raise TypeError('Expected a single file identifier string')
        # Read the XML in using ElementTree, making sure the underlying
        # stream is released even if parsing fails.
        stream = self.abspath(fileid).open()
        try:
            elt = ElementTree.parse(stream).getroot()
        finally:
            stream.close()
        # If requested, wrap it.
        if self._wrap_etree:
            elt = ElementWrapper(elt)
        # Return the ElementTree element.
        return elt

    def words(self, fileid=None):
        """
        Returns all of the words and punctuation symbols in the specified file
        that were in text nodes -- ie, tags are ignored. Like the xml() method,
        fileid can only specify one file.

        Only the ``text`` attribute of each element is tokenized; text
        stored in an element's ``tail`` is not included.

        :return: the given file's text nodes as a list of words and punctuation symbols
        :rtype: list(str)
        """
        elt = self.xml(fileid)
        word_tokenizer = WordPunctTokenizer()
        # ``getiterator()`` was removed from ElementTree in Python 3.9;
        # keep using it where it exists and fall back to ``iter()``.
        try:
            iterator = elt.getiterator()
        except AttributeError:
            iterator = elt.iter()

        out = []
        for node in iterator:
            text = node.text
            if text is not None:
                out.extend(word_tokenizer.tokenize(text))
        return out

    def raw(self, fileids=None):
        """
        Return the concatenated contents of the given files.

        :param fileids: A single file identifier, a list of file
            identifiers, or None to use every file in the corpus.
        :return: The result of ``concat()`` applied to the contents
            read from each file, in order.
        """
        if fileids is None:
            fileids = self._fileids
        elif isinstance(fileids, compat.string_types):
            fileids = [fileids]

        contents = []
        for f in fileids:
            # Close each stream as soon as it has been read so that
            # large corpora do not exhaust file descriptors.
            stream = self.open(f)
            try:
                contents.append(stream.read())
            finally:
                stream.close()
        return concat(contents)

 

 

class XMLCorpusView(StreamBackedCorpusView):
    """
    A corpus view that selects out specified elements from an XML
    file, and provides a flat list-like interface for accessing them.
    (Note: ``XMLCorpusView`` is not used by ``XMLCorpusReader`` itself,
    but may be used by subclasses of ``XMLCorpusReader``.)

    Every XML corpus view has a "tag specification", indicating what
    XML elements should be included in the view; and each (non-nested)
    element that matches this specification corresponds to one item in
    the view.  Tag specifications are regular expressions over tag
    paths, where a tag path is a list of element tag names, separated
    by '/', indicating the ancestry of the element.  Some examples:

      - ``'foo'``: A top-level element whose tag is ``foo``.
      - ``'foo/bar'``: An element whose tag is ``bar`` and whose parent
        is a top-level element whose tag is ``foo``.
      - ``'.*/foo'``: An element whose tag is ``foo``, appearing anywhere
        in the xml tree.
      - ``'.*/(foo|bar)'``: An element whose tag is ``foo`` or ``bar``,
        appearing anywhere in the xml tree.

    The view items are generated from the selected XML elements via
    the method ``handle_elt()``.  By default, this method returns the
    element as-is (i.e., as an ElementTree object); but it can be
    overridden, either via subclassing or via the ``elt_handler``
    constructor parameter.
    """

    #: If true, then display debugging output to stdout when reading
    #: blocks.
    _DEBUG = False

    #: The number of characters read at a time by this corpus reader.
    _BLOCK_SIZE = 1024

    #: Byte-order marks and the encodings they indicate.  The UTF-32
    #: marks must be tested before the UTF-16 ones, because the
    #: UTF-16-LE mark is a prefix of the UTF-32-LE mark.
    _BOM_ENCODINGS = (
        (codecs.BOM_UTF32_BE, 'utf-32-be'),
        (codecs.BOM_UTF32_LE, 'utf-32-le'),
        (codecs.BOM_UTF16_BE, 'utf-16-be'),
        (codecs.BOM_UTF16_LE, 'utf-16-le'),
        (codecs.BOM_UTF8, 'utf-8'),
    )

    #: Byte patterns that extract the encoding name from an XML
    #: declaration (double-quoted form first, then single-quoted).
    _ENCODING_DECL_RES = (
        re.compile(br'\s*<\?xml\b.*\bencoding="([^"]+)"'),
        re.compile(br"\s*<\?xml\b.*\bencoding='([^']+)'"),
    )

    def __init__(self, fileid, tagspec, elt_handler=None):
        """
        Create a new corpus view based on a specified XML file.

        Note that the ``XMLCorpusView`` constructor does not take an
        ``encoding`` argument, because the unicode encoding is
        specified by the XML files themselves.

        :type tagspec: str
        :param tagspec: A tag specification, indicating what XML
            elements should be included in the view.  Each non-nested
            element that matches this specification corresponds to one
            item in the view.

        :param elt_handler: A function used to transform each element
            to a value for the view.  If no handler is specified, then
            ``self.handle_elt()`` is called, which returns the element
            as an ElementTree object.  The signature of elt_handler is::

                elt_handler(elt, context) -> value

            where ``context`` is the '/'-separated tag path of ``elt``.
        """
        if elt_handler: self.handle_elt = elt_handler

        # The tag specification for this corpus view, anchored so that
        # it has to match up to the end of the tag path.
        self._tagspec = re.compile(tagspec+r'\Z')

        # A dictionary mapping from file positions (as returned by
        # ``stream.tell()``) to XML contexts.  An XML context is a
        # tuple of XML tag names, indicating which tags have not yet
        # been closed.
        self._tag_context = {0: ()}

        encoding = self._detect_encoding(fileid)
        StreamBackedCorpusView.__init__(self, fileid, encoding=encoding)

    def _detect_encoding(self, fileid):
        """
        Guess the encoding of an XML file from its first line.

        A byte-order mark takes precedence; otherwise the ``encoding``
        pseudo-attribute of the XML declaration is used.

        :param fileid: A ``PathPointer`` or a file system path.
        :return: The name of the detected encoding, or ``'utf-8'`` if
            neither a byte-order mark nor a declared encoding is found.
        :rtype: str
        """
        if isinstance(fileid, PathPointer):
            stream = fileid.open()
        else:
            stream = open(fileid, 'rb')
        # Only the first line is needed; release the handle right away.
        try:
            s = stream.readline()
        finally:
            stream.close()

        for bom, encoding in self._BOM_ENCODINGS:
            if s.startswith(bom):
                return encoding

        for decl_re in self._ENCODING_DECL_RES:
            m = decl_re.match(s)
            if m:
                # The line was read as bytes; ``str()`` yields the
                # native string type on both Python 2 and Python 3.
                return str(m.group(1).decode('ascii'))

        # No encoding found -- what should the default be?
        return 'utf-8'

    def handle_elt(self, elt, context):
        """
        Convert an element into an appropriate value for inclusion in
        the view.  Unless overridden by a subclass or by the
        ``elt_handler`` constructor argument, this method simply
        returns ``elt``.

        :return: The view value corresponding to ``elt``.

        :type elt: ElementTree
        :param elt: The element that should be converted.

        :type context: str
        :param context: A string composed of element tags separated by
            forward slashes, indicating the XML context of the given
            element.  For example, the string ``'foo/bar/baz'``
            indicates that the element is a ``baz`` element whose
            parent is a ``bar`` element and whose grandparent is a
            top-level ``foo`` element.
        """
        return elt

    #: A regular expression that matches XML fragments that do not
    #: contain any un-closed tags.  The generic tag alternative refuses
    #: to start on ``<!--`` or ``<![CDATA[`` so that a fragment can
    #: never end in the middle of a comment or CDATA section merely
    #: because that section happens to contain a '>'.
    _VALID_XML_RE = re.compile(r"""
        [^<]*
        (
          ((<!--.*?-->)                         |  # comment
           (<!\[CDATA\[.*?\]\]>)                |  # raw character data
           (<!DOCTYPE\s+[^\[]*(\[[^\]]*])?\s*>) |  # doctype decl
           (<(?!!--|!\[CDATA\[)[^>]*>))            # tag or PI
          [^<]*)*
        \Z""",
        re.DOTALL|re.VERBOSE)

    #: A regular expression used to extract the tag name from a start tag,
    #: end tag, or empty-elt tag string.  The name stops at whitespace,
    #: '/' or '>', so ``<foo/>`` yields ``foo`` rather than ``foo/``.
    _XML_TAG_NAME = re.compile(r'<\s*/?\s*([^\s/>]+)')

    #: A regular expression used to find all start-tags, end-tags, and
    #: empty-elt tags in an XML file.  This regexp is more lenient than
    #: the XML spec -- e.g., it allows spaces in some places where the
    #: spec does not.
    _XML_PIECE = re.compile(r"""
        # Include these so we can skip them:
        (?P<COMMENT>        <!--.*?-->                          )|
        (?P<CDATA>          <!\[CDATA\[.*?\]\]>                 )|
        (?P<PI>             <\?.*?\?>                           )|
        (?P<DOCTYPE>        <!DOCTYPE\s+[^\[]*(\[[^\]]*])?\s*>  )|
        # These are the ones we actually care about:
        (?P<EMPTY_ELT_TAG>  <\s*[^>/\?!\s][^>]*/\s*>            )|
        (?P<START_TAG>      <\s*[^>/\?!\s][^>]*>                )|
        (?P<END_TAG>        <\s*/[^>/\?!\s][^>]*>               )""",
        re.DOTALL|re.VERBOSE)

    def _read_xml_fragment(self, stream):
        """
        Read a string from the given stream that does not contain any
        un-closed tags.  In particular, this function first reads a
        block from the stream of size ``self._BLOCK_SIZE``.  It then
        checks if that block contains an un-closed tag.  If it does,
        then this function either backtracks to the last '<', or reads
        another block.

        :return: The fragment that was read; an empty string at the
            end of the stream.
        :raise ValueError: If a '>' appears before any '<' in the
            fragment, or if the stream ends inside an un-closed tag.
        """
        fragment = ''

        # Remember where the *fragment* starts.  This has to be taken
        # once, before the loop: ``last_open_bracket`` below is an
        # offset into the whole accumulated fragment, not into the
        # most recently read block.
        if isinstance(stream, SeekableUnicodeStreamReader):
            startpos = stream.tell()

        while True:
            # Read a block and add it to the fragment.
            xml_block = stream.read(self._BLOCK_SIZE)
            fragment += xml_block

            # Do we have a well-formed xml fragment?
            if self._VALID_XML_RE.match(fragment):
                return fragment

            # Do we have a fragment that will never be well-formed?
            first_bracket = re.search('[<>]', fragment)
            if first_bracket is not None and first_bracket.group(0) == '>':
                pos = stream.tell() - (len(fragment) - first_bracket.end())
                raise ValueError('Unexpected ">" near char %s' % pos)

            # End of file?
            if not xml_block:
                raise ValueError('Unexpected end of file: tag not closed')

            # If not, then we must be in the middle of a <..tag..>.
            # If appropriate, backtrack to the most recent '<'
            # character.
            last_open_bracket = fragment.rfind('<')
            if last_open_bracket > 0:
                if self._VALID_XML_RE.match(fragment[:last_open_bracket]):
                    if isinstance(stream, SeekableUnicodeStreamReader):
                        stream.seek(startpos)
                        stream.char_seek_forward(last_open_bracket)
                    else:
                        stream.seek(-(len(fragment)-last_open_bracket), 1)
                    return fragment[:last_open_bracket]

            # Otherwise, read another block. (i.e., return to the
            # top of the loop.)

    def read_block(self, stream, tagspec=None, elt_handler=None):
        """
        Read from ``stream`` until we find at least one element that
        matches ``tagspec``, and return the result of applying
        ``elt_handler`` to each element found.

        :param stream: The stream to read from.  Its current position
            must be one that is recorded in ``self._tag_context`` (the
            start of the file, or the end of a previously read block).
        :param tagspec: The tag specification to match; defaults to the
            one given to the constructor.
        :param elt_handler: The function applied to each matching
            element; defaults to ``self.handle_elt``.
        :return: A list of handler results; empty when the end of the
            stream is reached without finding a matching element.
        :raise ValueError: If an end tag does not match the innermost
            open tag, or the stream ends inside a matching element.
        """
        if tagspec is None: tagspec = self._tagspec
        if elt_handler is None: elt_handler = self.handle_elt

        # Use a stack of strings to keep track of our context.  The
        # lookup is checked *before* it is copied, so that an unknown
        # position is reported clearly.
        context = self._tag_context.get(stream.tell())
        assert context is not None, 'no XML context recorded for position'
        context = list(context)

        elts = []

        elt_start = None # where does the elt start
        elt_depth = None # what context depth
        elt_text = ''

        while elts==[] or elt_start is not None:
            if isinstance(stream, SeekableUnicodeStreamReader):
                startpos = stream.tell()
            xml_fragment = self._read_xml_fragment(stream)

            # End of file.
            if not xml_fragment:
                if elt_start is None: break
                else: raise ValueError('Unexpected end of file')

            # Process each <tag> in the xml fragment.
            for piece in self._XML_PIECE.finditer(xml_fragment):
                if self._DEBUG:
                    print('%25s %s' % ('/'.join(context)[-20:], piece.group()))

                if piece.group('START_TAG'):
                    name = self._XML_TAG_NAME.match(piece.group()).group(1)
                    # Keep context up-to-date.
                    context.append(name)
                    # Is this one of the elts we're looking for?
                    if elt_start is None:
                        if re.match(tagspec, '/'.join(context)):
                            elt_start = piece.start()
                            elt_depth = len(context)

                elif piece.group('END_TAG'):
                    name = self._XML_TAG_NAME.match(piece.group()).group(1)
                    # sanity checks:
                    if not context:
                        raise ValueError('Unmatched tag </%s>' % name)
                    if name != context[-1]:
                        raise ValueError('Unmatched tag <%s>...</%s>' %
                                         (context[-1], name))
                    # Is this the end of an element?
                    if elt_start is not None and elt_depth == len(context):
                        elt_text += xml_fragment[elt_start:piece.end()]
                        elts.append( (elt_text, '/'.join(context)) )
                        elt_start = elt_depth = None
                        elt_text = ''
                    # Keep context up-to-date
                    context.pop()

                elif piece.group('EMPTY_ELT_TAG'):
                    name = self._XML_TAG_NAME.match(piece.group()).group(1)
                    # An empty element only counts as a view item when
                    # it is not nested inside an element being collected.
                    if elt_start is None:
                        if re.match(tagspec, '/'.join(context)+'/'+name):
                            elts.append((piece.group(),
                                         '/'.join(context)+'/'+name))

            if elt_start is not None:
                # If we haven't found any elements yet, then keep
                # looping until we do.  The partial element is carried
                # over, so it starts at offset 0 of the next fragment.
                if elts == []:
                    elt_text += xml_fragment[elt_start:]
                    elt_start = 0

                # If we've found at least one element, then try
                # backtracking to the start of the element that we're
                # inside of.
                else:
                    # take back the last start-tag, and return what
                    # we've gotten so far (elts is non-empty).
                    if self._DEBUG:
                        print(' '*36+'(backtrack)')
                    if isinstance(stream, SeekableUnicodeStreamReader):
                        stream.seek(startpos)
                        stream.char_seek_forward(elt_start)
                    else:
                        stream.seek(-(len(xml_fragment)-elt_start), 1)
                    context = context[:elt_depth-1]
                    elt_start = elt_depth = None
                    elt_text = ''

        # Update the _tag_context dict.
        pos = stream.tell()
        if pos in self._tag_context:
            assert tuple(context) == self._tag_context[pos]
        else:
            self._tag_context[pos] = tuple(context)

        # Non-ASCII characters become numeric character references, so
        # the parser always receives plain ASCII bytes.
        return [elt_handler(ElementTree.fromstring(
                                elt_xml.encode('ascii', 'xmlcharrefreplace')),
                            elt_context)
                for (elt_xml, elt_context) in elts]