Hot-keys on this page

r m x p   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

1

2

3

4

5

6

7

8

9

10

11

12

13

14

15

16

17

18

19

20

21

22

23

24

25

26

27

28

29

30

31

32

33

34

35

36

37

38

39

40

41

42

43

44

45

46

47

48

49

50

51

52

53

54

55

56

57

58

59

60

61

62

63

64

65

66

67

68

69

70

71

72

73

74

75

76

77

78

79

80

81

82

83

84

85

86

87

88

89

90

91

92

93

94

95

96

97

98

99

100

101

102

103

104

105

106

107

108

109

110

111

112

113

114

115

116

117

118

119

120

121

122

123

124

125

126

127

128

129

130

131

132

133

134

135

136

137

138

139

140

141

142

143

144

145

146

147

148

149

150

151

152

153

154

155

156

157

158

159

160

161

162

163

164

165

166

167

168

169

170

171

172

173

174

175

176

177

178

179

180

181

182

183

184

185

186

187

188

189

190

191

192

193

194

195

196

197

198

199

200

201

202

203

204

205

206

207

208

209

210

211

212

213

214

215

216

217

218

219

220

221

222

223

224

225

226

227

228

229

230

231

232

233

234

235

236

237

238

239

240

241

# Natural Language Toolkit: Combinatory Categorial Grammar 

# 

# Copyright (C) 2001-2012 NLTK Project 

# Author: Graeme Gange <ggange@csse.unimelb.edu.au> 

# URL: <http://www.nltk.org/> 

# For license information, see LICENSE.TXT 

 

import re 

from collections import defaultdict 

 

from nltk.ccg.api import PrimitiveCategory, Direction, CCGVar, FunctionalCategory 

 

#------------ 

# Regular expressions used for parsing components of the lexicon 

#------------ 

 

# Parses a primitive category and its optional subscripts, e.g. "NP[sg]".
# Group 1 is the category name (letters only); group 2 is the bracketed,
# comma-separated subscript list including the brackets, or None if absent.

rePrim = re.compile(r'''([A-Za-z]+)(\[[A-Za-z,]+\])?''') 

 

# Separates the next primitive category from the remainder of the
# string.
# Group 1 is the primitive (with any subscripts attached); group 2 is
# everything that follows it.

reNextPrim = re.compile(r'''([A-Za-z]+(?:\[[A-Za-z,]+\])?)(.*)''') 

 

# Separates the next application operator from the remainder.
# Group 1 is the slash (backslash or forward slash), groups 2 and 3 are
# each an optional '.' or ',' following the slash, and group 4 is the
# rest of the string.

reApp = re.compile(r'''([\\/])([.,]?)([.,]?)(.*)''') 

 

# Parses the definition of the category of either a word or a family.
# Group 1 is the identifier (letters and underscores), group 2 is the
# separator ('::' or an arrow made of '-'/'=' characters ending in '>'),
# and group 3 is the category string.

reLex = re.compile(r'''([A-Za-z_]+)\s*(::|[-=]+>)\s*(.+)''') 

 

# Strips comments from a line.
# Group 1 is the text before the first '#'; the '#' and everything after
# it are matched but not captured.

reComm = re.compile('''([^#]*)(?:#.*)?''') 

 

#---------- 

# Lexicons 

#---------- 

class CCGLexicon(object):
    '''
    Class representing a lexicon for CCG grammars.

    start - Name of the target category; wrapped in a PrimitiveCategory
    primitives - The list of primitive categories for the lexicon
    families - Families of categories
    entries - A mapping of words to possible categories
    '''
    def __init__(self, start, primitives, families, entries):
        self._start = PrimitiveCategory(start)
        self._primitives = primitives
        self._families = families
        self._entries = entries

    def categories(self, word):
        '''
        Returns all the possible categories for a word.

        This is a plain lookup in the entries mapping.  When the mapping
        is a defaultdict(list), looking up an unknown word stores and
        returns an empty list for it.
        '''
        return self._entries[word]

    def start(self):
        '''Returns the target category for the parser.'''
        return self._start

    def __str__(self):
        '''
        String representation of the lexicon, used for debugging.

        Produces one line per word of the form "word => cat1 | cat2".
        A word with no categories is still given its own line, so the
        following entry never runs into it.
        '''
        lines = []
        for ident in self._entries:
            alternatives = " | ".join(str(cat) for cat in self._entries[ident])
            lines.append(ident + " => " + alternatives)
        return "\n".join(lines)

 

 

#----------- 

# Parsing lexicons 

#----------- 

 

# Separates the contents matching the first set of brackets 

# from the rest of the input. 

def matchBrackets(string):
    '''
    Splits off the leading bracketed group of a string.

    The first character is taken to be the opening bracket.  Returns a
    tuple (group, rest) where group runs from that opening bracket up to
    and including its matching ')', and rest is whatever follows.

    Raises AssertionError if a bracket is never closed; the message shows
    the text starting at the innermost bracket still open at the end.
    '''
    # Positions of the brackets that are currently open; index 0 is the
    # bracket this call is matching.
    open_positions = [0]

    for index in range(1, len(string)):
        char = string[index]
        if char == '(':
            open_positions.append(index)
        elif char == ')':
            open_positions.pop()
            if not open_positions:
                return ('(' + string[1:index + 1], string[index + 1:])

    unmatched = string[open_positions[-1]:]
    raise AssertionError('Unmatched bracket in string \'' + unmatched + '\'')

 

# Separates the string for the next portion of the category 

# from the rest of the string 

def nextCategory(string):
    '''
    Splits a category string into its first component and the rest.

    A string that starts with '(' is split by matchBrackets; anything
    else is split with the reNextPrim pattern.  Either way the result is
    a pair (component, remainder).
    '''
    if not string.startswith('('):
        return reNextPrim.match(string).groups()
    return matchBrackets(string)

 

# Parses an application operator 

def parseApplication(app):
    '''
    Builds a Direction from a parsed application operator.

    app is a sequence whose first element is the slash and whose
    remaining elements are passed on together as the second argument of
    Direction.
    '''
    slash = app[0]
    modifiers = app[1:]
    return Direction(slash, modifiers)

 

# Parses the subscripts for a primitive category 

def parseSubscripts(subscr):
    '''
    Turns a bracketed subscript string such as "[sg,pl]" into a list.

    The first and last characters (the brackets) are dropped and the
    remainder is split on commas.  A missing or empty subscript string
    gives an empty list.
    '''
    return subscr[1:-1].split(',') if subscr else []

 

# Parse a primitive category 

def parsePrimitiveCategory(chunks,primitives,families,var):
    '''
    Resolves one atomic category to a (category, var) pair.

    chunks - (name, subscripts) as captured by the rePrim pattern
    primitives - names accepted as primitive categories
    families - mapping of family name to a (category, var) pair
    var - the CCG variable in use so far, or None

    Raises AssertionError if the name is neither a family nor a primitive.
    '''
    name = chunks[0]

    # The special category 'var' without subscripts stands for the CCG
    # variable: reuse the one passed in, or create it on first use.
    if name == "var" and chunks[1] is None:
        shared = CCGVar() if var is None else var
        return (shared, shared)

    if name in families:
        (family_cat, family_var) = families[name]
        if var is not None:
            # A variable is already in use, so rewrite the family's own
            # variable to it.
            return (family_cat.substitute([(family_var, var)]), var)
        return (family_cat, family_var)

    if name not in primitives:
        raise AssertionError('String \'' + name + '\' is neither a family nor primitive category.')

    subscripts = parseSubscripts(chunks[1])
    return (PrimitiveCategory(name, subscripts), var)

 

# parseCategory drops the 'var' from the tuple 

def parseCategory(line,primitives,families):
    '''
    Parses a category string and returns only the category.

    Calls augParseCategory and discards the variable half of its result.
    '''
    (category, _unused_var) = augParseCategory(line, primitives, families)
    return category

 

# Parses a string representing a category, and returns 

# a tuple with (possibly) the CCG variable for the category 

def augParseCategory(line,primitives,families,var = None):
    '''
    Parses a string representing a category.

    Returns a tuple (category, var), where var is the CCG variable that
    came out of parsing the components (it may still be None).

    The string is consumed left to right: a first operand, then zero or
    more (operator, operand) pairs.  Each pair wraps the category built
    so far in a new FunctionalCategory, so operators group to the left.
    '''
    def parse_operand(text, current_var):
        # A bracketed operand is parsed recursively without its outer
        # brackets; anything else is an atomic category.
        if text.startswith('('):
            return augParseCategory(text[1:-1], primitives, families, current_var)
        return parsePrimitiveCategory(rePrim.match(text).groups(),
                                      primitives, families, current_var)

    (head, remainder) = nextCategory(line)
    (category, var) = parse_operand(head, var)

    while remainder != "":
        pieces = reApp.match(remainder).groups()
        direction = parseApplication(pieces[0:3])

        (operand_text, remainder) = nextCategory(pieces[3])
        (argument, var) = parse_operand(operand_text, var)
        category = FunctionalCategory(category, argument, direction)

    return (category, var)

 

# Takes an input string, and converts it into a lexicon for CCGs. 

def parseLexicon(lex_str):
    '''
    Takes an input string, and converts it into a lexicon for CCGs.

    Each line is handled after removing '#' comments and surrounding
    whitespace; blank lines are skipped.  Three kinds of line are known:

      :- S, N, NP          primitive categories (the first one listed
                           overall becomes the start category)
      Det :: NP/N          family definition
      which => (N/N)/NP    word definition

    Returns a CCGLexicon.

    Raises AssertionError if a line fits none of these forms, or if the
    input declares no primitive categories at all.
    '''
    primitives = []
    families = {}
    entries = defaultdict(list)

    for line in lex_str.splitlines():
        # Strip comments and leading/trailing whitespace.
        line = reComm.match(line).groups()[0].strip()
        if line == "":
            continue

        if line.startswith(':-'):
            # A line of primitive categories.
            # The first line is the target category
            # ie, :- S, N, NP, VP
            primitives.extend(prim.strip() for prim in line[2:].strip().split(','))
            continue

        # Either a family definition, or a word definition
        match = reLex.match(line)
        if match is None:
            # Report the offending line instead of failing on None.groups()
            raise AssertionError('Malformed lexicon line \'' + line + '\'')
        (ident, sep, catstr) = match.groups()
        (cat, var) = augParseCategory(catstr, primitives, families)

        if sep == '::':
            # Family definition
            # ie, Det :: NP/N
            families[ident] = (cat, var)
        else:
            # Word definition
            # ie, which => (N\N)/(S/NP)
            entries[ident].append(cat)

    if not primitives:
        # Without a ':-' line there is no start category to hand over.
        raise AssertionError('Lexicon does not declare any primitive categories.')
    return CCGLexicon(primitives[0], primitives, families, entries)

 

 

openccg_tinytiny = parseLexicon(''' 

    # Rather minimal lexicon based on the openccg `tinytiny' grammar. 

    # Only incorporates a subset of the morphological subcategories, however. 

    :- S,NP,N                    # Primitive categories 

    Det :: NP/N                  # Determiners 

    Pro :: NP 

    IntransVsg :: S\\NP[sg]    # Tensed intransitive verbs (singular) 

    IntransVpl :: S\\NP[pl]    # Plural 

    TransVsg :: S\\NP[sg]/NP   # Tensed transitive verbs (singular) 

    TransVpl :: S\\NP[pl]/NP   # Plural 

 

    the => NP[sg]/N[sg] 

    the => NP[pl]/N[pl] 

 

    I => Pro 

    me => Pro 

    we => Pro 

    us => Pro 

 

    book => N[sg] 

    books => N[pl] 

 

    peach => N[sg] 

    peaches => N[pl] 

 

    policeman => N[sg] 

    policemen => N[pl] 

 

    boy => N[sg] 

    boys => N[pl] 

 

    sleep => IntransVsg 

    sleep => IntransVpl 

 

    eat => IntransVpl 

    eat => TransVpl 

    eats => IntransVsg 

    eats => TransVsg 

 

    see => TransVpl 

    sees => TransVsg 

    ''')