Hot-keys on this page

r m x p   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

1

2

3

4

5

6

7

8

9

10

11

12

13

14

15

16

17

18

19

20

21

22

23

24

25

26

27

28

29

30

31

32

33

34

35

36

37

38

39

40

41

42

43

44

45

46

47

48

49

50

51

52

53

54

55

56

57

58

59

60

61

62

63

64

65

66

67

68

69

70

71

72

73

74

75

76

77

78

79

80

81

82

83

84

85

86

87

88

89

90

91

92

93

94

95

96

97

98

99

100

101

102

103

104

105

106

107

108

109

110

111

112

113

114

115

116

117

118

119

120

121

122

123

124

125

126

127

128

129

130

131

132

133

134

135

136

137

138

139

140

141

142

143

144

145

146

147

148

149

150

151

152

153

154

155

156

# Natural Language Toolkit: Lin's Thesaurus 

# 

# Copyright (C) 2001-2012 NLTK Project 

# Author: Dan Blanchard <dan.blanchard@gmail.com> 

# URL: <http://www.nltk.org/> 

# For license information, see LICENSE.txt 

from __future__ import print_function 

 

import re 

from collections import defaultdict 

from functools import reduce 

 

from nltk.corpus.reader import CorpusReader 

 

 

class LinThesaurusCorpusReader(CorpusReader):
    """ Wrapper for the LISP-formatted thesauruses distributed by Dekang Lin. """

    # Compiled regular expression for extracting the key from the first line of each
    # thesaurus entry
    _key_re = re.compile(r'\("?([^"]+)"? \(desc [0-9.]+\).+')

    @staticmethod
    def __defaultdict_factory():
        ''' Factory for creating defaultdict of defaultdict(dict)s '''
        return defaultdict(dict)

    def __init__(self, root, badscore=0.0):
        '''
        Initialize the thesaurus by reading every matching file under root into memory.

        The data is stored as nested mappings: fileid -> entry key -> synonym ngram -> score.

        @param root: root directory containing thesaurus LISP files
        @type root: C{string}
        @param badscore: the score to give to words which do not appear in each other's sets of synonyms
        @type badscore: C{float}
        @raise ValueError: if the score column of a tab-separated line is not a valid number
        '''

        super(LinThesaurusCorpusReader, self).__init__(root, r'sim[A-Z]\.lsp')
        self._thesaurus = defaultdict(LinThesaurusCorpusReader.__defaultdict_factory)
        self._badscore = badscore
        # NOTE(review): the encoding reported by abspaths() is not passed to open(),
        # so files are read with the platform default encoding -- confirm this is intended.
        for path, encoding, fileid in self.abspaths(include_encoding=True, include_fileid=True):
            with open(path) as lin_file:
                # True while the next line read is expected to be the header of an entry
                first = True
                for line in lin_file:
                    line = line.strip()
                    # Start of entry
                    if first:
                        key = LinThesaurusCorpusReader._key_re.sub(r'\1', line)
                        first = False
                    # End of entry
                    elif line == '))':
                        first = True
                    # Lines with pairs of ngrams and scores
                    else:
                        split_line = line.split('\t')
                        if len(split_line) == 2:
                            ngram, score = split_line
                            # Store the score as a number so that similarity() returns
                            # the same type as badscore and the 1.0 identity score.
                            self._thesaurus[fileid][key][ngram.strip('"')] = float(score)

    def _entry(self, fileid, ngram):
        '''
        Look up the synonym-to-score mapping stored for an ngram in one fileid.

        Uses C{dict.get} instead of indexing so that a failed lookup does not insert
        an empty entry into the underlying defaultdicts (which would make
        C{__contains__} report ngrams that were merely queried).

        @param fileid: thesaurus fileid to search in
        @type fileid: C{string}
        @param ngram: ngram to lookup
        @type ngram: C{string}
        @return: mapping of synonyms to scores; an empty dict if nothing is stored.
        '''
        return self._thesaurus.get(fileid, {}).get(ngram, {})

    def _score(self, fileid, ngram1, ngram2):
        '''
        Returns the stored score of ngram2 within ngram1's entry in the given fileid,
        or the configured bad score when ngram2 is not listed there.
        '''
        return self._entry(fileid, ngram1).get(ngram2, self._badscore)

    def similarity(self, ngram1, ngram2, fileid=None):
        '''
        Returns the similarity score for two ngrams.

        @param ngram1: first ngram to compare
        @type ngram1: C{string}
        @param ngram2: second ngram to compare
        @type ngram2: C{string}
        @param fileid: thesaurus fileid to search in. If None, search all fileids.
        @type fileid: C{string}
        @return: If fileid is specified, just the score for the two ngrams; otherwise,
                 list of tuples of fileids and scores.
        '''
        # Entries don't contain themselves, so make sure similarity between item and itself is 1.0
        if ngram1 == ngram2:
            if fileid:
                return 1.0
            return [(fid, 1.0) for fid in self._fileids]

        if fileid:
            return self._score(fileid, ngram1, ngram2)
        return [(fid, self._score(fid, ngram1, ngram2)) for fid in self._fileids]

    def scored_synonyms(self, ngram, fileid=None):
        '''
        Returns the scored synonyms (tuples of synonyms and scores) for the current ngram

        @param ngram: ngram to lookup
        @type ngram: C{string}
        @param fileid: thesaurus fileid to search in. If None, search all fileids.
        @type fileid: C{string}
        @return: If fileid is specified, the (synonym, score) items of the entry; otherwise,
                 list of tuples of fileids and such item collections.
        '''
        if fileid:
            return self._entry(fileid, ngram).items()
        return [(fid, self._entry(fid, ngram).items()) for fid in self._fileids]

    def synonyms(self, ngram, fileid=None):
        '''
        Returns the synonyms for the current ngram.

        @param ngram: ngram to lookup
        @type ngram: C{string}
        @param fileid: thesaurus fileid to search in. If None, search all fileids.
        @type fileid: C{string}
        @return: If fileid is specified, the synonyms of the entry; otherwise, list of tuples of
                 fileids and synonym collections.
        '''
        if fileid:
            return self._entry(fileid, ngram).keys()
        return [(fid, self._entry(fid, ngram).keys()) for fid in self._fileids]

    def __contains__(self, ngram):
        '''
        Determines whether or not the given ngram is in the thesaurus.

        @param ngram: ngram to lookup
        @type ngram: C{string}
        @return: whether the given ngram has an entry in any of the fileids.
        '''
        # .get() keeps the membership test from creating empty per-fileid tables.
        return any(ngram in self._thesaurus.get(fileid, ()) for fileid in self._fileids)

 

 

###################################################################### 

# Demo 

###################################################################### 

 

def demo():
    """
    Print synonyms, scored synonyms and a similarity score for two sample
    words, using the C{lin_thesaurus} corpus object from C{nltk.corpus}.
    """
    # Imported here so that the corpus is only looked up when the demo is run.
    from nltk.corpus import lin_thesaurus as thes

    word1 = "business"
    word2 = "enterprise"
    print("Getting synonyms for " + word1)
    print(thes.synonyms(word1))

    # Call scored_synonyms() here so the output matches the announcement above it.
    print("Getting scored synonyms for " + word1)
    print(thes.scored_synonyms(word1))

    print("Getting synonyms from simN.lsp (noun subsection) for " + word1)
    print(thes.synonyms(word1, fileid="simN.lsp"))

    # Show the scored variant for the single fileid instead of repeating the previous step.
    print("Getting scored synonyms from simN.lsp (noun subsection) for " + word1)
    print(thes.scored_synonyms(word1, fileid="simN.lsp"))

    print("Similarity score for %s and %s:" % (word1, word2))
    print(thes.similarity(word1, word2))

 

 

# Run the demo only when this file is executed as a script, not when it is imported.
if __name__ == '__main__': 

    demo()