Hot-keys on this page

r m x p   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

1

2

3

4

5

6

7

8

9

10

11

12

13

14

15

16

17

18

19

20

21

22

23

24

25

26

27

28

29

30

31

32

33

34

35

36

37

38

39

40

41

42

43

44

45

46

47

48

49

50

51

52

53

54

55

56

57

58

59

60

61

# Natural Language Toolkit: String Category Corpus Reader 

# 

# Copyright (C) 2001-2012 NLTK Project 

# Author: Steven Bird <sb@ldc.upenn.edu> 

#         Edward Loper <edloper@gradient.cis.upenn.edu> 

# URL: <http://www.nltk.org/> 

# For license information, see LICENSE.TXT 

 

""" 

Read tuples from a corpus consisting of categorized strings. 

For example, from the question classification corpus: 

 

NUM:dist How far is it from Denver to Aspen ? 

LOC:city What county is Modesto , California in ? 

HUM:desc Who was Galileo ? 

DESC:def What is an atom ? 

NUM:date When did Hawaii become a state ? 

""" 

 

# based on PPAttachmentCorpusReader 

 

import os 

 

from nltk import compat 

from .util import * 

from .api import * 

 

# [xx] Should the order of the tuple be reversed?  In most other places

# in nltk, we use the form (data, tag) -- e.g., tagged words and

# labeled texts for classifiers.

class StringCategoryCorpusReader(CorpusReader):
    """
    Corpus reader for files in which each non-empty line holds a category
    label, a delimiter, and then the string that the label applies to,
    e.g. ``NUM:dist How far is it from Denver to Aspen ?``.

    Each such line is exposed as a tuple obtained by splitting the line
    once on the delimiter, i.e. ``(category, string)``.
    """

    def __init__(self, root, fileids, delimiter=' ', encoding=None):
        """
        :param root: The root directory for this corpus.
        :param fileids: A list or regexp specifying the fileids in this corpus.
        :param delimiter: Field delimiter
        :param encoding: Passed through unchanged to ``CorpusReader.__init__``.
        """
        CorpusReader.__init__(self, root, fileids, encoding)
        self._delimiter = delimiter

    def tuples(self, fileids=None):
        """
        :param fileids: A single fileid, a list of fileids, or None to use
            every fileid in this corpus.
        :return: the concatenation of one ``StreamBackedCorpusView`` per
            file, each of which reads its file with ``_read_tuple_block``.
        """
        if fileids is None:
            fileids = self._fileids
        elif isinstance(fileids, compat.string_types):
            # A bare string names a single file; normalize it to a list.
            fileids = [fileids]
        return concat([StreamBackedCorpusView(fileid, self._read_tuple_block,
                                              encoding=enc)
                       for (fileid, enc) in self.abspaths(fileids, True)])

    def raw(self, fileids=None):
        """
        :param fileids: A single fileid, a list of fileids, or None to use
            every fileid in this corpus.
        :return: the text contents of the given fileids, as a single string.
        """
        if fileids is None:
            fileids = self._fileids
        elif isinstance(fileids, compat.string_types):
            # A bare string names a single file; normalize it to a list.
            fileids = [fileids]

        contents = []
        for fileid in fileids:
            stream = self.open(fileid)
            # Close every stream explicitly, even if read() raises, so that
            # file handles are not left for the garbage collector.
            try:
                contents.append(stream.read())
            finally:
                stream.close()
        return concat(contents)

    def _read_tuple_block(self, stream):
        """
        Read one line from ``stream`` and return it as a block of tuples.

        :param stream: An open stream positioned at the start of a line.
        :return: a one-element list holding the tuple produced by splitting
            the stripped line once on the delimiter, or an empty list if
            the stripped line is empty (blank line or end of stream).
        """
        line = stream.readline().strip()
        if line:
            # maxsplit=1: only the first delimiter separates the category
            # from the text, so the text may itself contain the delimiter.
            return [tuple(line.split(self._delimiter, 1))]
        else:
            return []