Hot-keys on this page

r m x p   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

1

2

3

4

5

6

7

8

9

10

11

12

13

14

15

16

17

18

19

20

21

22

23

24

25

26

27

28

29

30

31

32

33

34

35

36

37

38

39

40

41

42

43

44

45

46

47

48

49

50

51

52

53

54

55

56

57

58

59

60

61

62

63

64

65

66

67

68

69

70

71

72

73

74

75

76

77

78

79

80

81

82

83

84

85

86

87

88

89

90

91

92

93

94

95

96

97

98

99

100

101

102

103

104

105

106

107

108

109

110

111

112

113

114

115

116

117

118

119

120

121

122

123

124

125

126

127

128

129

130

131

132

133

134

# -*- coding: utf-8 -*- 

# Natural Language Toolkit: Interface to the HunPos POS-tagger 

# 

# Copyright (C) 2001-2012 NLTK Project 

# Author: Peter Ljunglöf <peter.ljunglof@heatherleaf.se> 

#         David Nemeskey <nemeskeyd@gmail.com> (modifications) 

#         Attila Zseder <zseder@gmail.com> (modifications) 

# URL: <http://www.nltk.org/> 

# For license information, see LICENSE.TXT 

 

""" 

A module for interfacing with the HunPos open-source POS-tagger. 

""" 

 

import os 

from subprocess import Popen, PIPE 

 

from nltk.internals import find_binary, find_file 

from nltk.tag.api import TaggerI 

 

#: Project page of the HunPos tagger; handed to ``find_binary`` as its
#: ``url`` argument when the tagger class looks up the executable.
_hunpos_url = "http://code.google.com/p/hunpos/"

#: The default encoding used by hunpos: ISO-8859-1.
_hunpos_charset = "ISO-8859-1"

 

class HunposTagger(TaggerI):
    """
    A class for pos tagging with HunPos. The input is the paths to:
     - a model trained on training data
     - (optionally) the path to the hunpos-tag binary
     - (optionally) the encoding of the training data (default: ISO-8859-1)

    Example:

    .. doctest::
        :options: +SKIP

        >>> from nltk.tag.hunpos import HunposTagger
        >>> ht = HunposTagger('english.model')
        >>> ht.tag('What is the airspeed of an unladen swallow ?'.split())
        [('What', 'WP'), ('is', 'VBZ'), ('the', 'DT'), ('airspeed', 'NN'), ('of', 'IN'), ('an', 'DT'), ('unladen', 'NN'), ('swallow', 'VB'), ('?', '.')]
        >>> ht.close()

    This class communicates with the hunpos-tag binary via pipes. When the
    tagger object is no longer needed, the close() method should be called to
    free system resources. The class supports the context manager interface; if
    used in a with statement, the close() method is invoked automatically:

    .. doctest::
        :options: +SKIP

        >>> with HunposTagger('english.model') as ht:
        ...     ht.tag('What is the airspeed of an unladen swallow ?'.split())
        ...
        [('What', 'WP'), ('is', 'VBZ'), ('the', 'DT'), ('airspeed', 'NN'), ('of', 'IN'), ('an', 'DT'), ('unladen', 'NN'), ('swallow', 'VB'), ('?', '.')]
    """

    # Class-level default: if __init__ raises before the subprocess has been
    # started (e.g. the binary or the model cannot be found), close() and
    # therefore __del__ find this attribute and do nothing instead of failing
    # with AttributeError.
    _closed = True

    def __init__(self, path_to_model, path_to_bin=None,
                 encoding=_hunpos_charset, verbose=False):
        """
        Starts the hunpos-tag executable and establishes a connection with it.

        :param path_to_model: The model file.
        :param path_to_bin: The hunpos-tag binary.
        :param encoding: The encoding used by the model. Text (unicode)
            tokens passed to tag() are converted to this charset when they
            are sent to hunpos-tag.
            The default is ISO-8859-1 (Latin-1).

            This parameter is ignored for byte-string tokens, which are sent
            as-is. The caller must ensure that such tokens are encoded in
            the right charset.
        :param verbose: Forwarded unchanged to ``find_binary`` and
            ``find_file``.
        """
        hunpos_paths = [
            os.path.expanduser(path)
            for path in ('.', '/usr/bin', '/usr/local/bin', '/opt/local/bin',
                         '/Applications/bin', '~/bin', '~/Applications/bin')
        ]

        self._hunpos_bin = find_binary(
                'hunpos-tag', path_to_bin,
                env_vars=('HUNPOS', 'HUNPOS_HOME'),
                searchpath=hunpos_paths,
                url=_hunpos_url,
                verbose=verbose)

        self._hunpos_model = find_file(path_to_model,
                env_vars=('HUNPOS', 'HUNPOS_HOME'), verbose=verbose)
        self._encoding = encoding
        # NOTE(review): stderr is piped but only drained by close(); a very
        # chatty hunpos-tag could presumably fill that pipe -- confirm.
        self._hunpos = Popen([self._hunpos_bin, self._hunpos_model],
                             shell=False, stdin=PIPE, stdout=PIPE, stderr=PIPE)
        self._closed = False

    def __del__(self):
        # Best-effort cleanup; close() does nothing if already closed or if
        # the subprocess was never started.
        self.close()

    def close(self):
        """Closes the pipe to the hunpos executable."""
        if not self._closed:
            # Mark as closed first so that a failure in communicate() is not
            # retried from __del__.
            self._closed = True
            self._hunpos.communicate()

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.close()

    def tag(self, tokens):
        """Tags a single sentence: a list of words.

        The tokens should not contain any newline characters.

        :param tokens: An iterable of tokens. Text tokens are encoded with
            the encoding given to the constructor; byte-string tokens are
            sent unchanged.
        :return: A list of ``(token, tag)`` pairs, one per input token, where
            ``token`` is the object that was passed in and ``tag`` is a
            native ``str``, or None when the line read back from hunpos-tag
            has no tab-separated tag field.
        :raises AssertionError: If a token contains a newline. Nothing is
            written to hunpos-tag in that case.
        """
        # ``unicode`` on Python 2, ``str`` on Python 3.
        text_type = type(u"")
        # Materialise the input: it is walked once for writing and once for
        # pairing tokens with the tags that come back.
        tokens = list(tokens)

        # Validate and encode everything before touching the pipe, so that an
        # invalid token cannot leave a half-written sentence behind.
        encoded = []
        for token in tokens:
            is_text = isinstance(token, text_type)
            newline = u"\n" if is_text else b"\n"
            if newline in token:
                # Raised explicitly (rather than with ``assert``) so that the
                # check survives ``python -O``; the exception type is kept.
                raise AssertionError("Tokens should not contain newlines")
            encoded.append(token.encode(self._encoding) if is_text else token)

        # One token per line; a final empty line tells hunpos that the
        # sentence is finished.
        stdin = self._hunpos.stdin
        stdin.write(b"".join(data + b"\n" for data in encoded) + b"\n")
        stdin.flush()

        stdout = self._hunpos.stdout
        tagged_tokens = []
        for token in tokens:
            fields = stdout.readline().strip().split(b"\t")
            tag = fields[1] if len(fields) > 1 else None
            # The pipe yields bytes on Python 3; hand back a native str there.
            # On Python 2 the tag already is a (byte) str and is left alone.
            if tag is not None and not isinstance(tag, str):
                tag = tag.decode(self._encoding)
            tagged_tokens.append((token, tag))
        # We have to read (and dismiss) the final empty line:
        stdout.readline()

        return tagged_tokens

 

 

if __name__ == "__main__":
    # Executed as a script: run this module's doctests, comparing expected
    # and actual output with whitespace differences normalised.
    from doctest import NORMALIZE_WHITESPACE, testmod

    testmod(optionflags=NORMALIZE_WHITESPACE)