Hot-keys on this page

r m x p   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

1

2

3

4

5

6

7

8

9

10

11

12

13

14

15

16

17

18

19

20

21

22

23

24

25

26

27

28

29

30

31

32

33

34

35

36

37

38

39

40

41

42

43

44

45

46

47

48

49

50

51

52

53

54

55

56

57

58

59

60

61

62

63

64

65

66

67

68

69

70

71

72

73

74

75

76

77

78

79

80

81

82

83

84

85

86

87

88

89

90

91

92

93

94

95

96

97

98

99

100

101

102

103

104

105

106

107

108

109

110

111

112

113

114

115

116

117

118

119

120

121

122

123

124

125

126

127

128

129

130

131

132

133

134

135

136

137

138

139

140

141

142

143

144

145

146

147

148

149

150

151

152

153

154

155

156

157

# Natural Language Toolkit: Distance Metrics 

# 

# Copyright (C) 2001-2012 NLTK Project 

# Author: Edward Loper <edloper@gradient.cis.upenn.edu> 

#         Steven Bird <sb@csse.unimelb.edu.au> 

#         Tom Lippincott <tom@cs.columbia.edu> 

# URL: <http://www.nltk.org/> 

# For license information, see LICENSE.TXT 

# 

 

""" 

Distance Metrics. 

 

Compute the distance between two items (usually strings). 

As metrics, they must satisfy the following three requirements: 

 

1. d(a, a) = 0 

2. d(a, b) >= 0 

3. d(a, c) <= d(a, b) + d(b, c) 

 

""" 

from __future__ import print_function 

 

def _edit_dist_init(len1, len2): 

    lev = [] 

    for i in range(len1): 

        lev.append([0] * len2)  # initialize 2-D array to zero 

    for i in range(len1): 

        lev[i][0] = i           # column 0: 0,1,2,3,4,... 

    for j in range(len2): 

        lev[0][j] = j           # row 0: 0,1,2,3,4,... 

    return lev 

 

def _edit_dist_step(lev, i, j, c1, c2): 

    a = lev[i-1][j  ] + 1            # skipping s1[i] 

    b = lev[i-1][j-1] + (c1 != c2)   # matching s1[i] with s2[j] 

    c = lev[i  ][j-1] + 1            # skipping s2[j] 

    lev[i][j] = min(a,b,c)           # pick the cheapest 

 

def edit_distance(s1, s2): 

    """ 

    Calculate the Levenshtein edit-distance between two strings. 

    The edit distance is the number of characters that need to be 

    substituted, inserted, or deleted, to transform s1 into s2.  For 

    example, transforming "rain" to "shine" requires three steps, 

    consisting of two substitutions and one insertion: 

    "rain" -> "sain" -> "shin" -> "shine".  These operations could have 

    been done in other orders, but at least three steps are needed. 

 

    :param s1, s2: The strings to be analysed 

    :type s1: str 

    :type s2: str 

    :rtype int 

    """ 

    # set up a 2-D array 

    len1 = len(s1); len2 = len(s2) 

    lev = _edit_dist_init(len1+1, len2+1) 

 

    # iterate over the array 

    for i in range(len1): 

        for j in range (len2): 

            _edit_dist_step(lev, i+1, j+1, s1[i], s2[j]) 

    return lev[len1][len2] 

 

 

def binary_distance(label1, label2): 

    """Simple equality test. 

 

    0.0 if the labels are identical, 1.0 if they are different. 

 

    >>> from nltk.metrics import binary_distance 

    >>> binary_distance(1,1) 

    0.0 

 

    >>> binary_distance(1,3) 

    1.0 

    """ 

 

    if label1 == label2: 

        return 0.0 

    else: 

        return 1.0 

 

 

def jaccard_distance(label1, label2): 

    """Distance metric comparing set-similarity. 

 

    """ 

    return (len(label1.union(label2)) - len(label1.intersection(label2)))/float(len(label1.union(label2))) 

 

 

def masi_distance(label1, label2): 

    """Distance metric that takes into account partial agreement when multiple 

    labels are assigned. 

 

    >>> from nltk.metrics import masi_distance 

    >>> masi_distance(set([1,2]),set([1,2,3,4])) 

    0.5 

 

    Passonneau 2005, Measuring Agreement on Set-Valued Items (MASI) for Semantic and Pragmatic Annotation. 

    """ 

 

    return 1 - float(len(label1.intersection(label2)))/float(max(len(label1),len(label2))) 

 

 

def interval_distance(label1,label2): 

    """Krippendorff'1 interval distance metric 

 

    >>> from nltk.metrics import interval_distance 

    >>> interval_distance(1,10) 

    81 

 

    Krippendorff 1980, Content Analysis: An Introduction to its Methodology 

    """ 

    try: 

        return pow(label1-label2,2) 

#        return pow(list(label1)[0]-list(label2)[0],2) 

    except: 

        print("non-numeric labels not supported with interval distance") 

 

 

def presence(label): 

    """Higher-order function to test presence of a given label 

 

    """ 

    return lambda x,y: 1.0*((label in x) == (label in y)) 

 

 

def fractional_presence(label): 

    return lambda x,y:abs((float(1.0/len(x)) - float(1.0/len(y))))*(label in x and label in y) or 0.0*(label not in x and label not in y) or abs((float(1.0/len(x))))*(label in x and label not in y) or ((float(1.0/len(y))))*(label not in x and label in y) 

 

 

def custom_distance(file): 

    data = {} 

    for l in open(file): 

        labelA, labelB, dist = l.strip().split("\t") 

        labelA = frozenset([labelA]) 

        labelB = frozenset([labelB]) 

        data[frozenset([labelA,labelB])] = float(dist) 

    return lambda x,y:data[frozenset([x,y])] 

 

 

def demo(): 

    s1 = "rain" 

    s2 = "shine" 

    print("Edit distance between '%s' and '%s':" % (s1,s2), edit_distance(s1, s2)) 

 

    s1 = set([1,2,3,4]) 

    s2 = set([3,4,5]) 

    print("s1:", s1) 

    print("s2:", s2) 

    print("Binary distance:", binary_distance(s1, s2)) 

    print("Jaccard distance:", jaccard_distance(s1, s2)) 

    print("MASI distance:", masi_distance(s1, s2)) 

 

if __name__ == '__main__': 

    demo()