Hot-keys on this page

r m x p   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

1

2

3

4

5

6

7

8

9

10

11

12

13

14

15

16

17

18

19

20

21

22

23

24

25

26

27

28

29

30

31

32

33

34

35

36

37

38

39

40

41

42

43

44

45

46

47

48

49

50

51

52

53

54

55

56

57

58

59

60

61

62

63

64

65

66

67

68

69

70

71

72

73

74

75

76

77

78

79

80

81

82

83

84

85

86

87

88

89

90

91

92

93

94

95

96

97

98

99

100

101

102

103

104

105

106

107

108

109

110

111

112

113

114

115

116

117

118

119

120

121

122

123

124

125

126

127

128

129

130

131

132

133

134

135

136

137

138

139

140

141

142

143

144

145

146

147

148

149

150

151

152

153

154

155

156

157

158

159

160

161

162

163

164

165

166

167

168

169

170

171

172

173

174

175

176

177

178

179

180

181

182

183

184

185

186

187

188

189

190

191

192

193

194

195

196

197

198

199

200

201

202

203

204

205

206

207

208

209

210

211

212

213

214

215

216

217

218

219

220

221

222

223

224

225

226

227

228

229

230

231

232

233

234

235

236

237

238

239

240

241

242

243

244

245

246

247

248

249

250

251

252

253

254

255

256

257

258

259

260

261

262

263

264

265

266

267

268

269

270

271

272

273

274

275

276

277

278

279

280

281

282

283

284

285

286

287

288

289

290

# encoding: utf-8 

# Natural Language Toolkit: Interface to the Senna tagger 

# 

# Copyright (C) 2001-2012 NLTK Project 

# Author: Rami Al-Rfou' <ralrfou@cs.stonybrook.edu> 

# URL: <http://www.nltk.org/> 

# For license information, see LICENSE.TXT 

 

""" 

A module for interfacing with the SENNA pipeline. 

""" 

 

from os import path, sep 

from subprocess import Popen, PIPE 

from platform import architecture, system 

from nltk.tag.api import TaggerI 

 

_senna_url = 'http://ml.nec-labs.com/senna/' 

 

 

class Error(Exception):
    """
    Root of this module's exception hierarchy.

    The module-specific exceptions derive from this class, so callers can
    catch every error raised by the SENNA interface with a single
    ``except Error`` clause.
    """

 

 

class ExecutableNotFound(Error):
    """
    Signals that no SENNA executable file exists at the expected location.
    """

 

 

class RunFailure(Error):
    """
    Signals that the SENNA pipeline was started but did not run
    successfully.
    """

 

 

class SentenceMisalignment(Error):
    """
    Signals that the SENNA output could not be aligned with the input.

    It is raised when the output contains a token at a sentence/token
    position for which the input sentences have no corresponding token,
    i.e. the sentences or tokens produced by SENNA do not line up with the
    ones that were submitted.
    """

 

 

class SennaTagger(TaggerI):
    r"""
    A general interface to the SENNA pipeline that supports any of the
    operations specified in SUPPORTED_OPERATIONS.

    Applying multiple operations at once has a speed advantage. For example,
    senna v2.0 will calculate the POS tags in case you are extracting the named
    entities. Applying both of the operations will cost only the time of
    extracting the named entities.

    The SENNA pipeline has a fixed maximum size for the sentences it can read.
    By default it is 1024 tokens/sentence. If you have larger sentences,
    changing the MAX_SENTENCE_SIZE value in SENNA_main.c should be considered
    and your system specific binary should be rebuilt. Otherwise this could
    introduce misalignment errors.

    The input is:

    - path to the directory that contains SENNA executables.
    - List of the operations needed to be performed.
    - (optionally) the encoding of the input data (default:utf-8)

    Example:

    .. doctest::
        :options: +SKIP

        >>> from nltk.tag.senna import SennaTagger
        >>> pipeline = SennaTagger('/usr/share/senna-v2.0', ['pos', 'chk', 'ner'])
        >>> sent = u'Düsseldorf is an international business center'.split()
        >>> pipeline.tag(sent)
        [{'word': u'D\xfcsseldorf', 'chk': u'B-NP', 'ner': u'B-PER', 'pos': u'NNP'},
        {'word': u'is', 'chk': u'B-VP', 'ner': u'O', 'pos': u'VBZ'},
        {'word': u'an', 'chk': u'B-NP', 'ner': u'O', 'pos': u'DT'},
        {'word': u'international', 'chk': u'I-NP', 'ner': u'O', 'pos': u'JJ'},
        {'word': u'business', 'chk': u'I-NP', 'ner': u'O', 'pos': u'NN'},
        {'word': u'center', 'chk': u'I-NP', 'ner': u'O','pos': u'NN'}]
    """

    # Operations whose output columns this class knows how to read. The order
    # of this list is the order used by _map() to number the output columns.
    SUPPORTED_OPERATIONS = ['pos', 'chk', 'ner']

    def __init__(self, senna_path, operations, encoding='utf-8'):
        """
        :param senna_path: directory that contains the SENNA executables. It
            is normalised and stored with a trailing path separator.
        :param operations: list of operation names. Each one is passed to
            the executable as a ``-<name>`` flag; only the names listed in
            SUPPORTED_OPERATIONS are copied into the tagging results.
        :param encoding: encoding used to encode the text sent to SENNA and
            to decode its output. If it is empty or None, no encoding or
            decoding is performed.
        """
        self._encoding = encoding
        self._path = path.normpath(senna_path) + sep
        self.operations = operations

    @property
    def executable(self):
        """
        The path of the system specific binary that should be used in the
        pipeline. If the system is not one of Linux, Windows or Darwin, the
        generic ``senna`` binary is used.
        """
        os_name = system()
        if os_name == 'Linux':
            bits = architecture()[0]
            if bits == '64bit':
                return path.join(self._path, 'senna-linux64')
            return path.join(self._path, 'senna-linux32')
        if os_name == 'Windows':
            return path.join(self._path, 'senna-win32.exe')
        if os_name == 'Darwin':
            return path.join(self._path, 'senna-osx')
        return path.join(self._path, 'senna')

    def _map(self):
        """
        Calculates the order of the columns that the SENNA pipeline will
        output the tags into.

        :return: a dictionary mapping every requested operation that is in
            SUPPORTED_OPERATIONS to its column index. Indices start at 1 and
            follow the order of SUPPORTED_OPERATIONS, not the order of
            ``self.operations``.
        """
        _map = {}
        # Column 0 is never read by this class (presumably it holds the
        # token itself), so tag columns are numbered from 1.
        i = 1
        for operation in SennaTagger.SUPPORTED_OPERATIONS:
            if operation in self.operations:
                _map[operation] = i
                i += 1
        return _map

    def tag(self, tokens):
        """
        Applies the specified operation(s) on a list of tokens.

        :param tokens: the tokens of a single sentence.
        :return: the first element of ``batch_tag([tokens])``.
        """
        return self.batch_tag([tokens])[0]

    def batch_tag(self, sentences):
        """
        Applies the specified operation(s) over a list of sentences.

        :param sentences: a list of sentences, each one a list of tokens.
        :return: a list with one list per sentence read from the SENNA
            output. Every inner list holds one dictionary per token; the
            dictionary contains the original token under the key 'word'
            plus one entry for each calculated annotation/tag.
        :raise ExecutableNotFound: if the executable is not a file.
        :raise RunFailure: if the SENNA process exits with a non-zero code.
        :raise SentenceMisalignment: if the output refers to a sentence or
            token position that does not exist in ``sentences``.
        """
        encoding = self._encoding
        # Resolve the property once; it queries the platform on every access.
        executable = self.executable

        # Verifies the existence of the executable
        if not path.isfile(executable):
            raise ExecutableNotFound(
                "Senna executable expected at %s but not found" % executable)

        # Build the senna command to run the tagger
        _senna_cmd = [executable, '-path', self._path, '-usrtokens', '-iobtags']
        _senna_cmd.extend(['-' + op for op in self.operations])

        # Serialize the actual sentences to a temporary string: one sentence
        # per line, tokens separated by single spaces.
        _input = '\n'.join(' '.join(x) for x in sentences) + '\n'
        # The pipe needs encoded bytes. ``bytes`` is an alias of ``str`` on
        # Python 2.6+, so this encodes exactly the text (unicode) input there
        # and also works on Python 3, where the ``unicode`` builtin no longer
        # exists.
        if encoding and not isinstance(_input, bytes):
            _input = _input.encode(encoding)

        # Run the tagger and get the output
        p = Popen(_senna_cmd, stdin=PIPE, stdout=PIPE, stderr=PIPE)
        (stdout, stderr) = p.communicate(input=_input)
        senna_output = stdout

        # Check the return code.
        if p.returncode != 0:
            raise RunFailure('Senna command failed! Details: %s' % stderr)

        if encoding:
            senna_output = stdout.decode(encoding)

        # Output the tagged sentences
        map_ = self._map()
        tagged_sentences = [[]]
        sentence_index = 0
        token_index = 0
        for tagged_word in senna_output.strip().split("\n"):
            if not tagged_word:
                # An empty line separates two sentences in the output.
                tagged_sentences.append([])
                sentence_index += 1
                token_index = 0
                continue
            tags = tagged_word.split('\t')
            result = {}
            for tag, column in map_.items():
                result[tag] = tags[column].strip()
            # The word is taken from the input rather than from the output,
            # so the caller gets back exactly the token that was passed in.
            try:
                result['word'] = sentences[sentence_index][token_index]
            except IndexError:
                raise SentenceMisalignment(
                    "Misalignment error occurred at sentence number %d. Possible reason"
                    " is that the sentence size exceeded the maximum size. Check the "
                    "documentation of SennaTagger class for more information."
                    % sentence_index)
            tagged_sentences[-1].append(result)
            token_index += 1
        return tagged_sentences

 

 

class POSTagger(SennaTagger):
    """
    A Part of Speech tagger built on the SENNA pipeline.

    The input is:

    - path to the directory that contains SENNA executables.
    - (optionally) the encoding of the input data (default:utf-8)

    Example:

    .. doctest::
        :options: +SKIP

        >>> from nltk.tag.senna import POSTagger
        >>> postagger = POSTagger('/usr/share/senna-v2.0')
        >>> postagger.tag('What is the airspeed of an unladen swallow ?'.split())
        [('What', 'WP'), ('is', 'VBZ'), ('the', 'DT'), ('airspeed', 'NN'),
        ('of', 'IN'), ('an', 'DT'), ('unladen', 'JJ'), ('swallow', 'VB'), ('?', '.')]
    """

    def __init__(self, path, encoding='utf-8'):
        # Only the 'pos' operation is requested from the pipeline.
        super(POSTagger, self).__init__(path, ['pos'], encoding)

    def batch_tag(self, sentences):
        """
        Tags a list of sentences and returns, for each sentence, a list of
        (word, tag) tuples where the tag is the 'pos' annotation.
        """
        tagged_sents = super(POSTagger, self).batch_tag(sentences)
        # Replace every annotation dictionary, in place, by a plain tuple.
        for sent in tagged_sents:
            for position, annotations in enumerate(sent):
                sent[position] = (annotations['word'], annotations['pos'])
        return tagged_sents

 

 

class NERTagger(SennaTagger):
    """
    A named entity extractor built on the SENNA pipeline.

    The input is:

    - path to the directory that contains SENNA executables.
    - (optionally) the encoding of the input data (default:utf-8)

    Example:

    .. doctest::
        :options: +SKIP

        >>> from nltk.tag.senna import NERTagger
        >>> nertagger = NERTagger('/usr/share/senna-v2.0')
        >>> nertagger.tag('Shakespeare theatre was in London .'.split())
        [('Shakespeare', u'B-PER'), ('theatre', u'O'), ('was', u'O'), ('in', u'O'),
        ('London', u'B-LOC'), ('.', u'O')]
        >>> nertagger.tag('UN headquarters are in NY , USA .'.split())
        [('UN', u'B-ORG'), ('headquarters', u'O'), ('are', u'O'), ('in', u'O'),
        ('NY', u'B-LOC'), (',', u'O'), ('USA', u'B-LOC'), ('.', u'O')]
    """

    def __init__(self, path, encoding='utf-8'):
        # Only the 'ner' operation is requested from the pipeline.
        super(NERTagger, self).__init__(path, ['ner'], encoding)

    def batch_tag(self, sentences):
        """
        Tags a list of sentences and returns, for each sentence, a list of
        (word, tag) tuples where the tag is the 'ner' annotation.
        """
        tagged_sents = super(NERTagger, self).batch_tag(sentences)
        # Replace every annotation dictionary, in place, by a plain tuple.
        for sent in tagged_sents:
            for position, annotations in enumerate(sent):
                sent[position] = (annotations['word'], annotations['ner'])
        return tagged_sents

 

 

class CHKTagger(SennaTagger):
    """
    A chunker built on the SENNA pipeline.

    The input is:

    - path to the directory that contains SENNA executables.
    - (optionally) the encoding of the input data (default:utf-8)

    Example:

    .. doctest::
        :options: +SKIP

        >>> from nltk.tag.senna import CHKTagger
        >>> chktagger = CHKTagger('/usr/share/senna-v2.0')
        >>> chktagger.tag('What is the airspeed of an unladen swallow ?'.split())
        [('What', u'B-NP'), ('is', u'B-VP'), ('the', u'B-NP'), ('airspeed', u'I-NP'),
        ('of', u'B-PP'), ('an', u'B-NP'), ('unladen', u'I-NP'), ('swallow',u'I-NP'),
        ('?', u'O')]
    """

    def __init__(self, path, encoding='utf-8'):
        # Only the 'chk' operation is requested from the pipeline.
        super(CHKTagger, self).__init__(path, ['chk'], encoding)

    def batch_tag(self, sentences):
        """
        Tags a list of sentences and returns, for each sentence, a list of
        (word, tag) tuples where the tag is the 'chk' annotation.
        """
        tagged_sents = super(CHKTagger, self).batch_tag(sentences)
        # Replace every annotation dictionary, in place, by a plain tuple.
        for sent in tagged_sents:
            for position, annotations in enumerate(sent):
                sent[position] = (annotations['word'], annotations['chk'])
        return tagged_sents