Hot-keys on this page

r m x p   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

1

2

3

4

5

6

7

8

9

10

11

12

13

14

15

16

17

18

19

20

21

22

23

24

25

26

27

28

29

30

31

32

33

34

35

36

37

38

39

40

41

42

43

44

45

46

47

48

49

50

51

52

53

54

55

56

57

58

59

60

61

62

63

64

65

66

67

68

69

70

71

72

73

74

75

76

77

78

79

80

81

82

83

84

85

86

87

88

89

90

91

92

93

94

95

96

97

98

99

100

101

102

103

104

105

106

107

108

109

110

111

112

113

114

115

116

117

118

119

120

121

122

123

124

125

126

127

128

129

130

131

132

133

134

135

136

137

138

139

140

141

142

143

144

145

146

147

148

149

150

151

152

153

154

155

156

157

158

159

160

161

162

163

164

165

166

167

168

169

170

171

172

173

174

175

176

177

178

179

180

181

182

183

184

185

186

187

188

189

190

191

192

193

194

195

196

197

198

199

200

201

202

203

204

205

206

207

208

209

210

211

212

213

214

215

216

217

218

219

220

221

222

223

224

225

226

227

228

229

230

231

232

233

234

235

236

# Natural Language Toolkit: Text Segmentation Metrics 

# 

# Copyright (C) 2001-2012 NLTK Project 

# Author: Edward Loper <edloper@gradient.cis.upenn.edu> 

#         Steven Bird <sb@csse.unimelb.edu.au> 

#         David Doukhan <david.doukhan@gmail.com> 

# URL: <http://www.nltk.org/> 

# For license information, see LICENSE.TXT 

 

 

 

""" 

Text Segmentation Metrics 

 

1. Windowdiff 

 

Pevzner, L., and Hearst, M., A Critique and Improvement of 

  an Evaluation Metric for Text Segmentation, 

Computational Linguistics 28, 19-36 

 

 

2. Generalized Hamming Distance 

 

Bookstein A., Kulyukin V.A., Raita T. 

Generalized Hamming Distance 

Information Retrieval 5, 2002, pp 353-375 

 

Baseline implementation in C++ 

http://digital.cs.usu.edu/~vkulyukin/vkweb/software/ghd/ghd.html 

 

Study describing benefits of Generalized Hamming Distance Versus 

WindowDiff for evaluating text segmentation tasks 

Begsten, Y.  Quel indice pour mesurer l'efficacite en segmentation de textes ? 

TALN 2009 

 

 

3. Pk text segmentation metric 

 

Beeferman D., Berger A., Lafferty J. (1999) 

Statistical Models for Text Segmentation 

Machine Learning, 34, 177-210 

""" 

 

import numpy as np 

from nltk.compat import xrange 

 

def windowdiff(seg1, seg2, k, boundary="1"): 

    """ 

    Compute the windowdiff score for a pair of segmentations.  A 

    segmentation is any sequence over a vocabulary of two items 

    (e.g. "0", "1"), where the specified boundary value is used to 

    mark the edge of a segmentation. 

 

    >>> s1 = "00000010000000001000000" 

    >>> s2 = "00000001000000010000000" 

    >>> s3 = "00010000000000000001000" 

    >>> windowdiff(s1, s1, 3) 

    0 

    >>> windowdiff(s1, s2, 3) 

    4 

    >>> windowdiff(s2, s3, 3) 

    16 

 

    :param seg1: a segmentation 

    :type seg1: str or list 

    :param seg2: a segmentation 

    :type seg2: str or list 

    :param k: window width 

    :type k: int 

    :param boundary: boundary value 

    :type boundary: str or int or bool 

    :rtype: int 

    """ 

 

    if len(seg1) != len(seg2): 

        raise ValueError("Segmentations have unequal length") 

    wd = 0 

    for i in range(len(seg1) - k): 

        wd += abs(seg1[i:i+k+1].count(boundary) - seg2[i:i+k+1].count(boundary)) 

    return wd 

 

 

 

# Generalized Hamming Distance 

 

def _init_mat(nrows, ncols, ins_cost, del_cost): 

    mat = np.empty((nrows, ncols)) 

    mat[0, :] = ins_cost * np.arange(ncols) 

    mat[:, 0] = del_cost * np.arange(nrows) 

    return mat 

 

 

def _ghd_aux(mat, rowv, colv, ins_cost, del_cost, shift_cost_coeff): 

    for i, rowi in enumerate(rowv): 

        for j, colj in enumerate(colv): 

            shift_cost = shift_cost_coeff * abs(rowi - colj) + mat[i, j] 

            if rowi == colj: 

                # boundaries are at the same location, no transformation required 

                tcost = mat[i, j] 

            elif rowi > colj: 

                # boundary match through a deletion 

                tcost = del_cost + mat[i, j + 1] 

            else: 

                # boundary match through an insertion 

                tcost = ins_cost + mat[i + 1, j] 

            mat[i + 1, j + 1] = min(tcost, shift_cost) 

 

 

def ghd(ref, hyp, ins_cost=2.0, del_cost=2.0, shift_cost_coeff=1.0, boundary='1'): 

    """ 

    Compute the Generalized Hamming Distance for a reference and a hypothetical 

    segmentation, corresponding to the cost related to the transformation 

    of the hypothetical segmentation into the reference segmentation 

    through boundary insertion, deletion and shift operations. 

 

    A segmentation is any sequence over a vocabulary of two items 

    (e.g. "0", "1"), where the specified boundary value is used to 

    mark the edge of a segmentation. 

 

    Recommended parameter values are a shift_cost_coeff of 2. 

    Associated with a ins_cost, and del_cost equal to the mean segment 

    length in the reference segmentation. 

 

    >>> # Same examples as Kulyukin C++ implementation 

    >>> ghd('1100100000', '1100010000', 1.0, 1.0, 0.5) 

    0.5 

    >>> ghd('1100100000', '1100000001', 1.0, 1.0, 0.5) 

    2.0 

    >>> ghd('011', '110', 1.0, 1.0, 0.5) 

    1.0 

    >>> ghd('1', '0', 1.0, 1.0, 0.5) 

    1.0 

    >>> ghd('111', '000', 1.0, 1.0, 0.5) 

    3.0 

    >>> ghd('000', '111', 1.0, 2.0, 0.5) 

    6.0 

 

    :param ref: the reference segmentation 

    :type ref: str or list 

    :param hyp: the hypothetical segmentation 

    :type hyp: str or list 

    :param ins_cost: insertion cost 

    :type ins_cost: float 

    :param del_cost: deletion cost 

    :type del_cost: float 

    :param shift_cost_coeff: constant used to compute the cost of a shift. 

    shift cost = shift_cost_coeff * |i - j| where i and j are 

    the positions indicating the shift 

    :type shift_cost_coeff: float 

    :param boundary: boundary value 

    :type boundary: str or int or bool 

    :rtype: float 

    """ 

 

    ref_idx = [i for (i, val) in enumerate(ref) if val == boundary] 

    hyp_idx = [i for (i, val) in enumerate(hyp) if val == boundary] 

 

    nref_bound = len(ref_idx) 

    nhyp_bound = len(hyp_idx) 

 

    if nref_bound == 0 and nhyp_bound == 0: 

        return 0.0 

    elif nref_bound > 0 and nhyp_bound == 0: 

        return nref_bound * ins_cost 

    elif nref_bound == 0 and nhyp_bound > 0: 

        return nhyp_bound * del_cost 

 

    mat = _init_mat(nhyp_bound + 1, nref_bound + 1, ins_cost, del_cost) 

    _ghd_aux(mat, hyp_idx, ref_idx, ins_cost, del_cost, shift_cost_coeff) 

    return mat[-1, -1] 

 

 

# Beeferman's Pk text segmentation evaluation metric 

 

def pk(ref, hyp, k=None, boundary='1'): 

    """ 

    Compute the Pk metric for a pair of segmentations A segmentation 

    is any sequence over a vocabulary of two items (e.g. "0", "1"), 

    where the specified boundary value is used to mark the edge of a 

    segmentation. 

 

    >>> s1 = "00000010000000001000000" 

    >>> s2 = "00000001000000010000000" 

    >>> s3 = "00010000000000000001000" 

    >>> pk(s1, s1, 3) 

    0.0 

    >>> pk(s1, s2, 3) 

    0.095238... 

    >>> pk(s2, s3, 3) 

    0.190476... 

 

    :param ref: the reference segmentation 

    :type ref: str or list 

    :param hyp: the segmentation to evaluate 

    :type hyp: str or list 

    :param k: window size, if None, set to half of the average reference segment length 

    :type boundary: str or int or bool 

    :param boundary: boundary value 

    :type boundary: str or int or bool 

    :rtype: float 

    """ 

 

    if k is None: 

        k = int(round(len(ref) / (ref.count(boundary) * 2.))) 

 

    n_considered_seg = len(ref) - k + 1 

    n_same_ref = 0.0 

    n_false_alarm = 0.0 

    n_miss = 0.0 

 

    for i in xrange(n_considered_seg): 

        bsame_ref_seg = False 

        bsame_hyp_seg = False 

 

        if boundary not in ref[(i+1):(i+k)]: 

            n_same_ref += 1.0 

            bsame_ref_seg = True 

        if boundary not in hyp[(i+1):(i+k)]: 

            bsame_hyp_seg = True 

 

        if bsame_hyp_seg and not bsame_ref_seg: 

            n_miss += 1 

        if bsame_ref_seg and not bsame_hyp_seg: 

            n_false_alarm += 1 

 

    prob_same_ref = n_same_ref / n_considered_seg 

    prob_diff_ref = 1 - prob_same_ref 

    prob_miss = n_miss / n_considered_seg 

    prob_false_alarm = n_false_alarm / n_considered_seg 

 

    return prob_miss * prob_diff_ref + prob_false_alarm * prob_same_ref 

 

 

if __name__ == "__main__": 

    import doctest 

    doctest.testmod(optionflags=doctest.NORMALIZE_WHITESPACE)