Hot-keys on this page

r m x p   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

1

2

3

4

5

6

7

8

9

10

11

12

13

14

15

16

17

18

19

20

21

22

23

24

25

26

27

28

29

30

31

32

33

34

35

36

37

38

39

40

41

42

43

44

45

46

47

48

49

50

51

52

53

54

55

56

57

58

59

60

61

62

# Natural Language Toolkit: Windowdiff 

# 

# Copyright (C) 2001-2012 NLTK Project 

# Author: Edward Loper <edloper@gradient.cis.upenn.edu> 

#         Steven Bird <sb@csse.unimelb.edu.au> 

# URL: <http://www.nltk.org/> 

# For license information, see LICENSE.TXT 

 

########################################################################## 

# Windowdiff 

# Pevzner, L., and Hearst, M., A Critique and Improvement of 

#   an Evaluation Metric for Text Segmentation, 

# Computational Linguistics, 28 (1), March 2002, pp. 19-36 

########################################################################## 

from __future__ import print_function 

 

def windowdiff(seg1, seg2, k, boundary="1"):
    """
    Return the (unnormalised) windowdiff score of two segmentations.

    Both segmentations are sequences of the same length in which the
    value ``boundary`` marks a segment edge.  A window is slid along
    both sequences in lockstep; at every start position ``i`` in
    ``range(len(seg1) - k)`` the slice ``[i:i+k+1]`` (i.e. ``k + 1``
    items) of each sequence is inspected, and the absolute difference
    between the two boundary counts is added to the score.  The total
    is returned as-is, without dividing by the number of windows.

    If ``k`` is greater than or equal to the length of the
    segmentations, no window is examined and the score is 0.

    >>> from nltk.metrics.windowdiff import windowdiff
    >>> s1 = "00000010000000001000000"
    >>> s2 = "00000001000000010000000"
    >>> s3 = "00010000000000000001000"
    >>> windowdiff(s1, s1, 3)
    0
    >>> windowdiff(s1, s2, 3)
    4
    >>> windowdiff(s2, s3, 3)
    16

    :param seg1: a segmentation
    :type seg1: str or list
    :param seg2: a segmentation
    :type seg2: str or list
    :param k: window width
    :type k: int
    :param boundary: boundary value
    :type boundary: str or int or bool
    :rtype: int
    :raise ValueError: if the two segmentations differ in length
    """
    length = len(seg1)
    if length != len(seg2):
        raise ValueError("Segmentations have unequal length")

    # Each window covers indices start .. start + k inclusive.
    return sum(
        abs(
            seg1[start:start + k + 1].count(boundary)
            - seg2[start:start + k + 1].count(boundary)
        )
        for start in range(length - k)
    )

 

def demo():
    """
    Print three sample segmentations, then the windowdiff score (with
    window width 3) of s1 against itself, s1 against s2 and s2 against s3.
    """
    first = "00000010000000001000000"
    second = "00000001000000010000000"
    third = "00010000000000000001000"

    # Show the raw segmentations before scoring them.
    for label, segmentation in (("s1:", first), ("s2:", second), ("s3:", third)):
        print(label, segmentation)

    width = 3
    print("windowdiff(s1, s1, 3) = ", windowdiff(first, first, width))
    print("windowdiff(s1, s2, 3) = ", windowdiff(first, second, width))
    print("windowdiff(s2, s3, 3) = ", windowdiff(second, third, width))