Hot-keys on this page

r m x p   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

1

2

3

4

5

6

7

8

9

10

11

12

13

14

15

16

17

18

19

20

21

22

23

24

25

26

27

28

29

30

31

32

33

34

35

36

37

38

39

40

41

42

43

44

45

46

47

48

49

50

51

52

53

54

55

56

57

58

59

60

61

62

63

64

65

66

67

68

69

70

71

72

73

74

75

# Natural Language Toolkit: Tagger Utilities 

# 

# Copyright (C) 2001-2012 NLTK Project 

# Author: Edward Loper <edloper@gradient.cis.upenn.edu> 

#         Steven Bird <sb@csse.unimelb.edu.au> 

# URL: <http://www.nltk.org/> 

# For license information, see LICENSE.TXT 

 

def str2tuple(s, sep='/'):
    """
    Split the string form of a tagged token into a ``(word, tag)`` tuple.

    The split is made at the last occurrence of *sep* in *s*: the text
    before it becomes the word, and the text after it, converted to
    upper case, becomes the tag.  When *sep* is not found in *s*, the
    whole string is treated as the word and the tag is None.

        >>> from nltk.tag.util import str2tuple
        >>> str2tuple('fly/NN')
        ('fly', 'NN')

    :type s: str
    :param s: The string representation of a tagged token.
    :type sep: str
    :param sep: The separator string used to separate word strings
        from tags.
    :rtype: tuple
    :return: ``(word, TAG)`` if *sep* occurs in *s*, else ``(s, None)``.
    """
    split_at = s.rfind(sep)

    # rfind() reports a missing separator as -1: no tag is available.
    if split_at < 0:
        return (s, None)

    word = s[:split_at]
    # Skip over the whole separator, which may be longer than one character.
    tag = s[split_at + len(sep):]
    return (word, tag.upper())

 

def tuple2str(tagged_token, sep='/'):
    """
    Build the string form of a tagged token from its ``(word, tag)``
    tuple.

    The result is the word, then *sep*, then the tag.  A token whose
    tag is None has no tag part: its word is returned unchanged.

        >>> from nltk.tag.util import tuple2str
        >>> tagged_token = ('fly', 'NN')
        >>> tuple2str(tagged_token)
        'fly/NN'

    :type tagged_token: tuple(str, str)
    :param tagged_token: The tuple representation of a tagged token.
    :type sep: str
    :param sep: The separator string used to separate word strings
        from tags.
    :return: ``word + sep + tag`` as a string, or the bare word when
        the tag is None.
    :raise AssertionError: If *sep* occurs inside the tag.
    """
    token_word, token_tag = tagged_token

    if token_tag is not None:
        # A tag that contains the separator is rejected outright.
        assert sep not in token_tag, 'tag may not contain sep!'
        return '%s%s%s' % (token_word, sep, token_tag)

    # Untagged token: hand the word back untouched.
    return token_word

 

def untag(tagged_sentence):
    """
    Strip the tags from a tagged sentence.

    Each item of *tagged_sentence* is unpacked as a ``(word, tag)``
    pair, and the words are collected, in order, into a new list.

        >>> from nltk.tag.util import untag
        >>> untag([('John', 'NNP'), ('saw', 'VBD'), ('Mary', 'NNP')])
        ['John', 'saw', 'Mary']

    :param tagged_sentence: An iterable of ``(word, tag)`` pairs.
    :rtype: list
    :return: The first element of every pair in *tagged_sentence*.
    """
    words = []
    # Two-name unpacking means every item must have exactly two elements.
    for word, _tag in tagged_sentence:
        words.append(word)
    return words

 

 

 

# Script entry point: when this file is executed directly (rather than
# imported), run the doctest examples embedded in this module's docstrings.
if __name__ == "__main__": 

    # Imported here so that doctest is only loaded for script runs.
    import doctest 

    # NORMALIZE_WHITESPACE makes doctest treat any run of whitespace in the
    # expected and actual output as equivalent when comparing them.
    doctest.testmod(optionflags=doctest.NORMALIZE_WHITESPACE)