# Natural Language Toolkit: Agreement Metrics 

# 

# Copyright (C) 2001-2012 NLTK Project 

# Author: Tom Lippincott <tom@cs.columbia.edu> 

# URL: <http://www.nltk.org/> 

# For license information, see LICENSE.TXT 

# 

 

""" 

Implementations of inter-annotator agreement coefficients surveyed by Artstein 

and Poesio (2007), Inter-Coder Agreement for Computational Linguistics. 

 

An agreement coefficient calculates the amount that annotators agreed on label 

assignments beyond what is expected by chance. 
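 
Most of the coefficients below share the chance-corrected form 
(Ao - Ae) / (1 - Ae), differing only in how the expected agreement Ae is 
estimated; alpha instead works with disagreements, as 1 - Do / De. 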

 

In defining the AnnotationTask class, we use naming conventions similar to the 

paper's terminology.  There are three types of objects in an annotation task: 

 

    the coders (variables "c" and "C") 

    the items to be annotated (variables "i" and "I") 

    the potential categories to be assigned (variables "k" and "K") 

 

Additionally, it is often the case that we don't want to treat two different 

labels as complete disagreement, and so the AnnotationTask constructor can also 

take a distance metric as a final argument.  Distance metrics are simply 

functions that take two arguments, and return a value between 0.0 and 1.0 

indicating the distance between them.  If not supplied, the default is binary 

comparison between the arguments. 
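 
For example, a minimal sketch of a custom distance for integer labels on a 
five-point scale (a hypothetical helper, not part of this module): 
 
    def ordinal_example_distance(a, b): 
        # scale the absolute difference into the range [0.0, 1.0] 
        return abs(int(a) - int(b)) / 4.0 
 
    task = AnnotationTask(data, distance=ordinal_example_distance) 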

 

The simplest way to initialize an AnnotationTask is with a list of triples, 
each giving one coder's label for one item as (coder, item, label): 
 
    task = AnnotationTask(data=[('c1', '1', 'v1'), ('c2', '1', 'v1'), ...]) 

 

Alpha (Krippendorff 1980) 

Kappa (Cohen 1960) 

S (Bennett, Albert and Goldstein 1954) 

Pi (Scott 1955) 

 

 

TODO: Describe handling of multiple coders and missing data 

 

Expected results from the Artstein and Poesio survey paper: 

 

.. doctest:: 

    :options: +SKIP 

 

    >>> from nltk.metrics.agreement import AnnotationTask 

    >>> import os.path 

    >>> t = AnnotationTask(data=[x.split() for x in open(os.path.join(os.path.dirname(__file__), "artstein_poesio_example.txt"))]) 

    >>> t.avg_Ao() 

    0.88 

    >>> t.pi() 

    0.79953224189776151 

    >>> t.S() 

    0.81999999999999984 

 

""" 

from __future__ import print_function 

 

import logging 

from itertools import groupby 

from operator import itemgetter 

 

from nltk.probability import FreqDist, ConditionalFreqDist 

from nltk.internals import deprecated 

from nltk import compat 

 

from nltk.metrics.distance import binary_distance 

 

log = logging.getLogger(__file__) 

 

class AnnotationTask(object): 

    """Represents an annotation task, i.e. people assign labels to items. 

 

    Notation tries to match notation in Artstein and Poesio (2007). 

 

    In general, coders and items can be represented as any hashable object. 

    Integers, for example, are fine, though strings are more readable. 

    Labels must support the distance functions applied to them, so e.g. 

    a string-edit-distance makes no sense if your labels are integers, 

    whereas interval distance needs numeric values.  A notable case of this 

    is the MASI metric, which requires Python sets. 
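 
    A sketch of such a task, assuming set-valued labels and the MASI distance 
    from nltk.metrics.distance: 
 
        from nltk.metrics.distance import masi_distance 
        t = AnnotationTask(data=[('c1', 'i1', frozenset(['a', 'b'])), 
                                 ('c2', 'i1', frozenset(['a']))], 
                           distance=masi_distance) 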

    """ 

 

    def __init__(self, data=None, distance=binary_distance): 

        """Initialize an empty annotation task. 

 

        """ 

        self.distance = distance 

        self.I = set() 

        self.K = set() 

        self.C = set() 

        self.data = [] 

        if data is not None: 

            self.load_array(data) 

 

    def __str__(self): 

        return "\r\n".join(map(lambda x:"%s\t%s\t%s" % 

                               (x['coder'], x['item'].replace('_', "\t"), 

                                ",".join(x['labels'])), self.data)) 

 

    def load_array(self, array): 

        """Load the results of annotation. 

 

        The argument is a list of 3-tuples, each representing a coder's labeling of an item: 

            (coder,item,label) 

        """ 

        for coder, item, labels in array: 

            self.C.add(coder) 

            self.K.add(labels) 

            self.I.add(item) 

            self.data.append({'coder':coder, 'labels':labels, 'item':item}) 

 

    def agr(self, cA, cB, i, data=None): 

        """Agreement between two coders on a given item 

 

        """ 

        data = data or self.data 

        kA = next((x for x in data if x['coder']==cA and x['item']==i)) 

        kB = next((x for x in data if x['coder']==cB and x['item']==i)) 

        ret = 1.0 - float(self.distance(kA['labels'], kB['labels'])) 

        log.debug("Observed agreement between %s and %s on %s: %f", 

                      cA, cB, i, ret) 

        log.debug("Distance between \"%r\" and \"%r\": %f", 

                      kA['labels'], kB['labels'], 1.0 - ret) 

        return ret 

 

    def Nk(self, k): 
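        """Number of times label k was assigned, over all coders and items.""" 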

        return float(sum(1 for x in self.data if x['labels'] == k)) 

 

    def Nik(self, i, k): 
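        """Number of times label k was assigned to item i.""" 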

        return float(sum(1 for x in self.data if x['item'] == i and x['labels'] == k)) 

 

    def Nck(self, c, k): 
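        """Number of times coder c assigned label k.""" 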

        return float(sum(1 for x in self.data if x['coder'] == c and x['labels'] == k)) 

 

    @deprecated('Use Nk, Nik or Nck instead') 

    def N(self, k=None, i=None, c=None): 

        """Implements the "n-notation" used in Artstein and Poesio (2007) 

 

        """ 

        if k is not None and i is None and c is None: 

            ret = self.Nk(k) 

        elif k is not None and i is not None and c is None: 

            ret = self.Nik(i, k) 

        elif k is not None and c is not None and i is None: 

            ret = self.Nck(c, k) 

        else: 

            raise ValueError("You must pass either i or c, not both! (k=%r,i=%r,c=%r)" % (k, i, c)) 

        log.debug("Count on N[%s,%s,%s]: %d", k, i, c, ret) 

        return ret 

 

    def _grouped_data(self, field, data=None): 

        data = data or self.data 

        return groupby(sorted(data, key=itemgetter(field)), itemgetter(field)) 

 

    def Ao(self, cA, cB): 

        """Observed agreement between two coders on all items. 

 

        """ 

        data = self._grouped_data('item', (x for x in self.data if x['coder'] in (cA, cB))) 

        ret = float(sum(self.agr(cA, cB, item, item_data) for item, item_data in data)) / float(len(self.I)) 

        log.debug("Observed agreement between %s and %s: %f", cA, cB, ret) 

        return ret 

 

    def _pairwise_average(self, function): 

        """ 

        Calculates the average of function results for each coder pair 

        """ 

        total = 0 

        n = 0 

        s = self.C.copy() 

        for cA in self.C: 

            s.remove(cA) 

            for cB in s: 

                total += function(cA, cB) 

                n += 1 

        ret = total / n 

        return ret 

 

    def avg_Ao(self): 

        """Average observed agreement across all coders and items. 

 

        """ 

        ret = self._pairwise_average(self.Ao) 

        log.debug("Average observed agreement: %f", ret) 

        return ret 

 

    def Do_alpha(self): 

        """The observed disagreement for the alpha coefficient. 

 

        The alpha coefficient, unlike the other metrics, uses this rather than 

        observed agreement. 

        """ 

        total = 0.0 

        for i, itemdata in self._grouped_data('item'): 

            label_freqs = FreqDist(x['labels'] for x in itemdata) 

 

            for j, nj in compat.iteritems(label_freqs): 

                for l, nl in compat.iteritems(label_freqs): 

                    total += float(nj * nl) * self.distance(l, j) 

        ret = (1.0 / float((len(self.I) * len(self.C) * (len(self.C) - 1)))) * total 

        log.debug("Observed disagreement: %f", ret) 

        return ret 

 

    def Do_Kw_pairwise(self, cA, cB, max_distance=1.0): 

        """The observed disagreement for the weighted kappa coefficient. 

 

        """ 

        total = 0.0 

        data = (x for x in self.data if x['coder'] in (cA, cB)) 

        for i, itemdata in self._grouped_data('item', data): 

            # we should have two items; distance doesn't care which comes first 

            total += self.distance(next(itemdata)['labels'], 
                    next(itemdata)['labels']) 

 

        ret = total / (len(self.I) * max_distance) 

        log.debug("Observed disagreement between %s and %s: %f", cA, cB, ret) 

        return ret 

 

    def Do_Kw(self, max_distance=1.0): 

        """Averaged over all labelers 

 

        """ 

        ret = self._pairwise_average(lambda cA, cB: self.Do_Kw_pairwise(cA, cB, max_distance)) 

        log.debug("Observed disagreement: %f", ret) 

        return ret 

 

    # Agreement Coefficients 

    def S(self): 

        """Bennett, Albert and Goldstein 1954 

 

        """ 

        Ae = 1.0 / float(len(self.K)) 

        ret = (self.avg_Ao() - Ae) / (1.0 - Ae) 

        return ret 

 

    def pi(self): 

        """Scott 1955; here, multi-pi. 

        Equivalent to K from Siegel and Castellan (1988). 
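 
        Expected agreement is taken from the pooled label distribution: 
        Ae = sum_k (n_k / (i*c))**2, where n_k is the total number of 
        assignments of label k; pi = (Ao - Ae) / (1 - Ae). 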

 

        """ 

        total = 0.0 

        label_freqs = FreqDist(x['labels'] for x in self.data) 

        for k, f in compat.iteritems(label_freqs): 

            total += f ** 2 

        Ae = total / float((len(self.I) * len(self.C)) ** 2) 

        return (self.avg_Ao() - Ae) / (1 - Ae) 

 

    def Ae_kappa(self, cA, cB): 
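        """Expected agreement between coders cA and cB for Cohen's kappa, 
        computed from each coder's per-label proportions.""" 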

        Ae = 0.0 

        nitems = float(len(self.I)) 

        label_freqs = ConditionalFreqDist((x['labels'], x['coder']) for x in self.data) 

        for k in label_freqs.conditions(): 

            Ae += (label_freqs[k][cA] / nitems) * (label_freqs[k][cB] / nitems) 

        return Ae 

 

    def kappa_pairwise(self, cA, cB): 

        """ 

 

        """ 

        Ae = self.Ae_kappa(cA, cB) 

        ret = (self.Ao(cA, cB) - Ae) / (1.0 - Ae) 

        log.debug("Expected agreement between %s and %s: %f", cA, cB, Ae) 

        return ret 

 

    def kappa(self): 

        """Cohen 1960 

        Averages naively over kappas for each coder pair. 

 

        """ 

        return self._pairwise_average(self.kappa_pairwise) 

 

    def multi_kappa(self): 

        """Davies and Fleiss 1982 

        Averages over observed and expected agreements for each coder pair. 

 

        """ 

        Ae = self._pairwise_average(self.Ae_kappa) 

        return (self.avg_Ao() - Ae) / (1.0 - Ae) 

 

    def alpha(self): 

        """Krippendorff 1980 

 

        """ 

        De = 0.0 

 

        label_freqs = FreqDist(x['labels'] for x in self.data) 

        for j in self.K: 

            nj = label_freqs[j] 

            for l in self.K: 

                De += float(nj * label_freqs[l]) * self.distance(j, l) 

        De = (1.0 / (len(self.I) * len(self.C) * (len(self.I) * len(self.C) - 1))) * De 

        log.debug("Expected disagreement: %f", De) 

        ret = 1.0 - (self.Do_alpha() / De) 

        return ret 

 

    def weighted_kappa_pairwise(self, cA, cB, max_distance=1.0): 

        """Cohen 1968 

 

        """ 

        total = 0.0 

        label_freqs = ConditionalFreqDist((x['coder'], x['labels']) 

                for x in self.data 

                if x['coder'] in (cA, cB)) 

        for j in self.K: 

            for l in self.K: 

                total += label_freqs[cA][j] * label_freqs[cB][l] * self.distance(j, l) 

        De = total / (max_distance * pow(len(self.I), 2)) 

        log.debug("Expected disagreement between %s and %s: %f", cA, cB, De) 

        Do = self.Do_Kw_pairwise(cA, cB, max_distance) 

        ret = 1.0 - (Do / De) 

        return ret 

 

    def weighted_kappa(self, max_distance=1.0): 

        """Cohen 1968 

 

        """ 

        return self._pairwise_average(lambda cA, cB: self.weighted_kappa_pairwise(cA, cB, max_distance)) 

 

 

if __name__ == '__main__': 

 

    import optparse 

    from . import distance 

 

    # process command-line arguments 

    parser = optparse.OptionParser() 

    parser.add_option("-d", "--distance", dest="distance", default="binary_distance", 

                      help="distance metric to use") 

    parser.add_option("-a", "--agreement", dest="agreement", default="kappa", 

                      help="agreement coefficient to calculate") 

    parser.add_option("-e", "--exclude", dest="exclude", action="append", 

                      default=[], help="coder names to exclude (may be specified multiple times)") 

    parser.add_option("-i", "--include", dest="include", action="append", default=[], 

                      help="coder names to include, same format as exclude") 

    parser.add_option("-f", "--file", dest="file", 

                      help="file to read labelings from, each line with three columns: 'labeler item labels'") 

    parser.add_option("-v", "--verbose", dest="verbose", default='0', 

                      help="how much debugging to print on stderr (0-4)") 

    parser.add_option("-c", "--columnsep", dest="columnsep", default="\t", 

                      help="char/string that separates the three columns in the file, defaults to tab") 

    parser.add_option("-l", "--labelsep", dest="labelsep", default=",", 

                      help="char/string that separates labels (if labelers can assign more than one), defaults to comma") 

    parser.add_option("-p", "--presence", dest="presence", default=None, 

                      help="convert each labeling into 1 or 0, based on presence of LABEL") 

    parser.add_option("-T", "--thorough", dest="thorough", default=False, action="store_true", 

                      help="calculate agreement for every subset of the annotators") 

    (options, remainder) = parser.parse_args() 

 

    if not options.file: 

        parser.print_help() 

        exit() 

 

    logging.basicConfig(level=50 - 10 * int(options.verbose)) 

 

    # read in data from the specified file 

    data = [] 

    for l in open(options.file): 

        toks = l.split(options.columnsep) 

        coder, object, labels = toks[0], str(toks[1:-1]), frozenset(toks[-1].strip().split(options.labelsep)) 

        if ((options.include == options.exclude) or 

            (len(options.include) > 0 and coder in options.include) or 

            (len(options.exclude) > 0 and coder not in options.exclude)): 

            data.append((coder, object, labels)) 

 

    if options.presence: 

        task = AnnotationTask(data, getattr(distance, options.distance)(options.presence)) 

    else: 

        task = AnnotationTask(data, getattr(distance, options.distance)) 

 

    if options.thorough: 

        # TODO: agreement over every subset of the annotators is not implemented 
        pass 

    else: 

        print(getattr(task, options.agreement)()) 

 

    logging.shutdown()