Hot-keys on this page

r m x p   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

1

2

3

4

5

6

7

8

9

10

11

12

13

14

15

16

17

18

19

20

21

22

23

24

25

26

27

28

29

30

31

32

33

34

35

36

37

38

39

40

41

42

43

44

45

46

47

48

49

50

51

52

53

54

55

56

57

58

59

60

61

62

63

64

65

66

67

68

69

70

71

72

73

74

75

76

77

78

79

80

81

82

83

84

85

86

87

88

89

90

91

92

93

94

95

96

97

98

99

100

101

102

103

104

105

106

107

108

109

110

111

112

113

114

115

116

117

118

119

120

121

122

123

124

125

126

127

128

129

130

131

132

133

134

135

136

137

138

139

140

141

142

143

144

145

# Natural Language Toolkit: RTE Corpus Reader 

# 

# Copyright (C) 2001-2012 NLTK Project 

# Author:  Ewan Klein <ewan@inf.ed.ac.uk> 

# URL: <http://www.nltk.org/> 

# For license information, see LICENSE.TXT 

 

""" 

Corpus reader for the Recognizing Textual Entailment (RTE) Challenge Corpora. 

 

The files were taken from the RTE1, RTE2 and RTE3 datasets and the files 

were regularized. 

 

Filenames are of the form rte*_dev.xml and rte*_test.xml. The latter are the 

gold standard annotated files. 

 

Each entailment corpus is a list of 'text'/'hypothesis' pairs. The following 

example is taken from RTE3:: 

 

<pair id="1" entailment="YES" task="IE" length="short" > 

 

    <t>The sale was made to pay Yukos' US$ 27.5 billion tax bill, 

    Yuganskneftegaz was originally sold for US$ 9.4 billion to a little known 

    company Baikalfinansgroup which was later bought by the Russian 

    state-owned oil company Rosneft .</t> 

 

   <h>Baikalfinansgroup was sold to Rosneft.</h> 

</pair> 

 

In order to provide globally unique IDs for each pair, a new attribute 

``challenge`` has been added to the root element ``entailment-corpus`` of each 

file, taking values 1, 2 or 3. The GID is formatted 'm-n', where 'm' is the 

challenge number and 'n' is the pair ID. 

""" 

 

from nltk import compat 

from .util import * 

from .api import * 

from .xmldocs import * 

 

 

def norm(value_string):
    """
    Convert the label found in an RTE pair's ``value`` or ``entailment``
    attribute into an integer: 1 for "TRUE"/"YES", 0 for "FALSE"/"NO".

    The lookup ignores case. A label other than these four raises
    ``KeyError``.

    :param value_string: the label used to classify a text/hypothesis pair
    :type value_string: str
    :rtype: int
    """
    label = value_string.upper()
    label_to_int = dict(TRUE=1, FALSE=0, YES=1, NO=0)
    return label_to_int[label]

 

class RTEPair:
    """
    A single text/hypothesis pair read from an RTE corpus.

    RTE1 marks the entailment judgement with a ``value`` attribute, while
    RTE2 and RTE3 use ``entailment``. Whichever one is present on the XML
    element is normalized with ``norm`` and stored in ``self.value``.
    """
    def __init__(self, pair, challenge=None, id=None, text=None, hyp=None,
                 value=None, task=None, length=None):
        """
        :param pair: the XML ``<pair>`` element; it must carry an ``id``
            attribute, and its first and second children supply the text
            and the hypothesis respectively
        :param challenge: version of the RTE challenge (i.e., RTE1, RTE2 or RTE3)
        :param id: identifier for the pair (accepted but not used; the id
            is always read from ``pair``)
        :param text: the text component of the pair (accepted but not used)
        :param hyp: the hypothesis component of the pair (accepted but not used)
        :param value: classification label used when ``pair`` has neither a
            ``value`` nor an ``entailment`` attribute
        :param task: task name used when ``pair`` has no ``task`` attribute
        :param length: length label used when ``pair`` has no ``length`` attribute
        """
        attributes = pair.attrib

        self.challenge = challenge
        self.id = attributes["id"]
        self.gid = "%s-%s" % (self.challenge, self.id)
        self.text = pair[0].text
        self.hyp = pair[1].text

        # ``value`` (RTE1) takes precedence over ``entailment`` (RTE2/RTE3);
        # the keyword argument is only a fallback when neither is present.
        for label_key in ("value", "entailment"):
            if label_key in attributes:
                self.value = norm(attributes[label_key])
                break
        else:
            self.value = value

        # Attributes on the element win over the keyword arguments.
        self.task = attributes.get("task", task)
        self.length = attributes.get("length", length)

    def __repr__(self):
        # Without a challenge number only the bare pair id is shown.
        if not self.challenge:
            return '<RTEPair: id=%s>' % self.id
        return '<RTEPair: gid=%s-%s>' % (self.challenge, self.id)

 

 

class RTECorpusReader(XMLCorpusReader):
    """
    Corpus reader for corpora in RTE challenges.

    This is just a wrapper around the XMLCorpusReader. See module docstring above for the expected
    structure of input documents.
    """

    def _read_etree(self, doc):
        """
        Map the XML input into a list of RTEPairs.

        Every ``<pair>`` element found under ``doc`` becomes one ``RTEPair``,
        tagged with the ``challenge`` attribute of ``doc`` (``None`` when
        that attribute is absent).

        :param doc: a parsed XML document
        :rtype: list(RTEPair)
        """
        challenge = doc.attrib.get('challenge')
        # ``Element.getiterator()`` was removed in Python 3.9; ``iter()`` is
        # its replacement. Fall back to the old name only on ElementTree
        # versions that predate ``iter()``.
        find_elements = getattr(doc, "iter", None)
        if find_elements is None:
            find_elements = doc.getiterator
        return [RTEPair(pair, challenge=challenge)
                for pair in find_elements("pair")]

    def pairs(self, fileids):
        """
        Build a list of RTEPairs from a RTE corpus.

        :param fileids: a list of RTE corpus fileids, or a single fileid
            given as a string
        :type fileids: list or str
        :rtype: list(RTEPair)
        """
        # A lone fileid string is treated as a one-element list.
        if isinstance(fileids, compat.string_types):
            fileids = [fileids]
        return concat([self._read_etree(self.xml(fileid)) for fileid in fileids])