Hot-keys on this page

r m x p   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

1

2

3

4

5

6

7

8

9

10

11

12

13

14

15

16

17

18

19

20

21

22

23

24

25

26

27

28

29

30

31

32

33

34

35

36

37

38

39

40

41

42

43

44

45

46

47

48

49

50

51

52

53

54

55

56

57

58

59

60

61

62

63

64

65

66

67

68

69

70

71

72

73

74

75

76

77

78

79

80

81

82

83

84

85

86

87

88

# Natural Language Toolkit: Indian Language POS-Tagged Corpus Reader 

# 

# Copyright (C) 2001-2012 NLTK Project 

# Author: Steven Bird <sb@ldc.upenn.edu> 

#         Edward Loper <edloper@gradient.cis.upenn.edu> 

# URL: <http://www.nltk.org/> 

# For license information, see LICENSE.TXT 

 

""" 

Indian Language POS-Tagged Corpus 

Collected by A Kumaran, Microsoft Research, India 

Distributed with permission 

 

Contents: 

  - Bangla: IIT Kharagpur 

  - Hindi: Microsoft Research India 

  - Marathi: IIT Bombay 

  - Telugu: IIIT Hyderabad 

""" 

 

import codecs 

 

from nltk import compat 

from nltk.tag.util import str2tuple 

 

from .util import * 

from .api import * 

 

class IndianCorpusReader(CorpusReader):
    """
    Corpus reader for the Indian Language POS-tagged corpus.

    Every line of a corpus file is read as one sentence made of
    whitespace-separated tokens, where each token joins a word and its
    tag with an underscore.  Lines that start with ``<`` contribute
    nothing (see ``IndianCorpusView.read_block``).
    """

    def _concat_views(self, fileids, tagged, group_by_sent,
                      tag_mapping_function=None):
        """
        Build one ``IndianCorpusView`` per requested file and concatenate
        them.

        :param fileids: file identifier(s) forwarded to ``self.abspaths``.
        :param tagged: if true, items are ``(word, tag)`` tuples rather
            than bare words.
        :param group_by_sent: if true, items are grouped into one list
            per line instead of being returned as a flat sequence.
        :param tag_mapping_function: optional callable applied to every
            tag, or None to leave tags untouched.
        """
        # The second positional argument asks abspaths() to pair every
        # path with its encoding.
        return concat([IndianCorpusView(fileid, enc, tagged, group_by_sent,
                                        tag_mapping_function)
                       for (fileid, enc) in self.abspaths(fileids, True)])

    def words(self, fileids=None):
        """Return the words of the given file(s) as a flat sequence."""
        return self._concat_views(fileids, False, False)

    def tagged_words(self, fileids=None, simplify_tags=False):
        """
        Return the given file(s) as a flat sequence of ``(word, tag)``
        tuples.

        :param simplify_tags: if true, pass every tag through this
            reader's ``_tag_mapping_function``.
        """
        tag_mapping_function = (self._tag_mapping_function
                                if simplify_tags else None)
        return self._concat_views(fileids, True, False, tag_mapping_function)

    def sents(self, fileids=None):
        """Return the given file(s) as a sequence of word lists."""
        return self._concat_views(fileids, False, True)

    def tagged_sents(self, fileids=None, simplify_tags=False):
        """
        Return the given file(s) as a sequence of lists of
        ``(word, tag)`` tuples.

        :param simplify_tags: if true, pass every tag through this
            reader's ``_tag_mapping_function``.
        """
        tag_mapping_function = (self._tag_mapping_function
                                if simplify_tags else None)
        return self._concat_views(fileids, True, True, tag_mapping_function)

    def raw(self, fileids=None):
        """
        Return the unprocessed contents of the given file(s), concatenated.

        :param fileids: a single file identifier, a list of them, or
            None for every file known to this reader.
        """
        if fileids is None:
            fileids = self._fileids
        elif isinstance(fileids, compat.string_types):
            fileids = [fileids]
        contents = []
        for f in fileids:
            stream = self.open(f)
            # Close each stream explicitly so handles are released even
            # if read() raises, instead of waiting for garbage collection.
            try:
                contents.append(stream.read())
            finally:
                stream.close()
        return concat(contents)

 

 

class IndianCorpusView(StreamBackedCorpusView):
    """
    Stream-backed view over one file of the Indian POS-tagged corpus.

    A block is a single line of the file, split on whitespace into
    tokens that are each parsed with ``str2tuple`` using ``_`` as the
    word/tag separator.
    """

    def __init__(self, corpus_file, encoding, tagged,
                 group_by_sent, tag_mapping_function=None):
        """
        :param corpus_file: file to read, forwarded to the base view.
        :param encoding: encoding forwarded to the base view.
        :param tagged: if true, yield ``(word, tag)`` tuples; otherwise
            yield bare words.
        :param group_by_sent: if true, yield one list per line;
            otherwise yield the tokens of each line individually.
        :param tag_mapping_function: optional callable applied to each
            tag; skipped when falsy.
        """
        # Options are stored before the base initializer runs.
        self._tagged = tagged
        self._group_by_sent = group_by_sent
        self._tag_mapping_function = tag_mapping_function
        StreamBackedCorpusView.__init__(self, corpus_file, encoding=encoding)

    def read_block(self, stream):
        """
        Read one line from ``stream`` and return its tokens.

        A line beginning with ``<`` produces an empty block.  A line with
        no tokens produces an empty token list, which is still wrapped
        as a (empty) sentence when grouping by sentence.
        """
        current = stream.readline()
        if current.startswith('<'):
            return []

        tokens = []
        for chunk in current.split():
            tokens.append(str2tuple(chunk, sep='_'))

        # Tags are remapped before the optional untagging step, so the
        # mapping function is invoked even when tags are then dropped.
        remap = self._tag_mapping_function
        if remap:
            tokens = [(word, remap(tag)) for (word, tag) in tokens]

        if not self._tagged:
            tokens = [word for (word, tag) in tokens]

        return [tokens] if self._group_by_sent else tokens