# Natural Language Toolkit: Agreement Metrics
#
# Copyright (C) 2001-2018 NLTK Project
# Author: Tom Lippincott <tom@cs.columbia.edu>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
#

"""
|
||
|
Implementations of inter-annotator agreement coefficients surveyed by Artstein
|
||
|
and Poesio (2007), Inter-Coder Agreement for Computational Linguistics.
|
||
|
|
||
|
An agreement coefficient calculates the amount that annotators agreed on label
|
||
|
assignments beyond what is expected by chance.
|
||
|
|
||
|
In defining the AnnotationTask class, we use naming conventions similar to the
|
||
|
paper's terminology. There are three types of objects in an annotation task:
|
||
|
|
||
|
the coders (variables "c" and "C")
|
||
|
the items to be annotated (variables "i" and "I")
|
||
|
the potential categories to be assigned (variables "k" and "K")
|
||
|
|
||
|
Additionally, it is often the case that we don't want to treat two different
|
||
|
labels as complete disagreement, and so the AnnotationTask constructor can also
|
||
|
take a distance metric as a final argument. Distance metrics are simply
|
||
|
functions that take two arguments, and return a value between 0.0 and 1.0
|
||
|
indicating the distance between them. If not supplied, the default is binary
|
||
|
comparison between the arguments.
|
||
|
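
For example, a task whose labels are sets can be scored with the MASI
distance (an illustrative sketch; the coders, items, and set-valued labels
here are made up):

    from nltk.metrics.distance import masi_distance

    task = AnnotationTask(data=[('c1', 'i1', frozenset(['v1', 'v2'])),
                                ('c2', 'i1', frozenset(['v1']))],
                          distance=masi_distance)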

The simplest way to initialize an AnnotationTask is with a list of triples,
each containing a coder's assignment for one object in the task:

    task = AnnotationTask(data=[('c1', '1', 'v1'), ('c2', '1', 'v1'), ...])

Note that the data list needs to contain the same number of triples for each
individual coder, containing category values for the same set of items.

The implemented coefficients are:

    Alpha (Krippendorff 1980)
    Kappa (Cohen 1960)
    S (Bennett, Albert and Goldstein 1954)
    Pi (Scott 1955)

TODO: Describe handling of multiple coders and missing data

Expected results from the Artstein and Poesio survey paper:

    >>> from nltk.metrics.agreement import AnnotationTask
    >>> import os.path
    >>> t = AnnotationTask(data=[x.split() for x in open(os.path.join(os.path.dirname(__file__), "artstein_poesio_example.txt"))])
    >>> t.avg_Ao()
    0.88
    >>> t.pi()
    0.7995322418977615...
    >>> t.S()
    0.8199999999999998...

    This would have returned a wrong value (0.0) in @785fb79 as coders are in
    the wrong order. Subsequently, all values for pi(), S(), and kappa() would
    have been wrong as they are computed with avg_Ao().

    >>> t2 = AnnotationTask(data=[('b', '1', 'stat'), ('a', '1', 'stat')])
    >>> t2.avg_Ao()
    1.0

    The following, of course, also works.

    >>> t3 = AnnotationTask(data=[('a', '1', 'othr'), ('b', '1', 'othr')])
    >>> t3.avg_Ao()
    1.0

"""
from __future__ import print_function, unicode_literals, division

import logging
from itertools import groupby
from operator import itemgetter

from six import iteritems

from nltk.probability import FreqDist, ConditionalFreqDist
from nltk.internals import deprecated
from nltk.compat import python_2_unicode_compatible

from nltk.metrics.distance import binary_distance

log = logging.getLogger(__file__)


@python_2_unicode_compatible
class AnnotationTask(object):
    """Represents an annotation task, i.e. people assign labels to items.

    Notation tries to match notation in Artstein and Poesio (2007).

    In general, coders and items can be represented as any hashable object.
    Integers, for example, are fine, though strings are more readable.
    Labels must support the distance functions applied to them, so e.g.
    a string-edit-distance makes no sense if your labels are integers,
    whereas interval distance needs numeric values. A notable case of this
    is the MASI metric, which requires Python sets.
    """

    def __init__(self, data=None, distance=binary_distance):
        """Initialize an annotation task.

        The data argument can be None (to create an empty annotation task)
        or a sequence of 3-tuples, each representing a coder's labeling of
        an item:
            (coder, item, label)

        The distance argument is a function taking two labels and returning
        a numerical distance. The distance from a label to itself should be
        zero:
            distance(l, l) = 0
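
        For illustration, a custom distance satisfying this could compare
        labels case-insensitively (a sketch, not part of this module):

            def nocase_distance(label1, label2):
                # 0.0 for labels that match ignoring case, 1.0 otherwise
                return 0.0 if label1.lower() == label2.lower() else 1.0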
"""
|
||
|
self.distance = distance
|
||
|
self.I = set()
|
||
|
self.K = set()
|
||
|
self.C = set()
|
||
|
self.data = []
|
||
|
if data is not None:
|
||
|
self.load_array(data)
|
||
|
|
||
|
    def __str__(self):
        return "\r\n".join(map(lambda x: "%s\t%s\t%s" %
                               (x['coder'], x['item'].replace('_', "\t"),
                                ",".join(x['labels'])), self.data))

    def load_array(self, array):
        """Load a sequence of annotation results, appending to any data already loaded.

        The argument is a sequence of 3-tuples, each representing a coder's
        labeling of an item:
            (coder, item, label)
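
        For example (with made-up coders, items, and labels):

            task = AnnotationTask()
            task.load_array([('c1', 'item1', 'label1'),
                             ('c2', 'item1', 'label2')])
            # now task.C == {'c1', 'c2'}, task.I == {'item1'},
            # and task.K == {'label1', 'label2'}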
"""
|
||
|
for coder, item, labels in array:
|
||
|
self.C.add(coder)
|
||
|
self.K.add(labels)
|
||
|
self.I.add(item)
|
||
|
self.data.append({'coder':coder, 'labels':labels, 'item':item})
|
||
|
|
||
|
    def agr(self, cA, cB, i, data=None):
        """Agreement between two coders on a given item."""
        data = data or self.data
        # cfedermann: we don't know what combination of coder/item will come
        # first in x; to avoid StopIteration problems due to assuming an order
        # cA,cB, we allow either for k1 and then look up the missing as k2.
        k1 = next(x for x in data if x['coder'] in (cA, cB) and x['item'] == i)
        if k1['coder'] == cA:
            k2 = next(x for x in data if x['coder'] == cB and x['item'] == i)
        else:
            k2 = next(x for x in data if x['coder'] == cA and x['item'] == i)

        ret = 1.0 - float(self.distance(k1['labels'], k2['labels']))
        log.debug("Observed agreement between %s and %s on %s: %f",
                  cA, cB, i, ret)
        log.debug("Distance between \"%r\" and \"%r\": %f",
                  k1['labels'], k2['labels'], 1.0 - ret)
        return ret

    def Nk(self, k):
        """The number of annotations, across all coders and items, with label k."""
        return float(sum(1 for x in self.data if x['labels'] == k))

    def Nik(self, i, k):
        """The number of annotations of item i with label k."""
        return float(sum(1 for x in self.data if x['item'] == i and x['labels'] == k))

    def Nck(self, c, k):
        """The number of annotations by coder c with label k."""
        return float(sum(1 for x in self.data if x['coder'] == c and x['labels'] == k))

    @deprecated('Use Nk, Nik or Nck instead')
    def N(self, k=None, i=None, c=None):
        """Implements the "n-notation" used in Artstein and Poesio (2007)."""
        if k is not None and i is None and c is None:
            ret = self.Nk(k)
        elif k is not None and i is not None and c is None:
            ret = self.Nik(i, k)
        elif k is not None and c is not None and i is None:
            ret = self.Nck(c, k)
        else:
            raise ValueError("You must pass k, plus at most one of i or c (k=%r, i=%r, c=%r)" % (k, i, c))
        log.debug("Count on N[%s,%s,%s]: %d", k, i, c, ret)
        return ret

    def _grouped_data(self, field, data=None):
        """Group the annotation records by the given field ('item', 'coder' or 'labels')."""
        data = data or self.data
        return groupby(sorted(data, key=itemgetter(field)), itemgetter(field))

    def Ao(self, cA, cB):
        """Observed agreement between two coders on all items.
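
        For example, two coders who agree on one of two items (made-up data):

            >>> t = AnnotationTask(data=[('c1', 'i1', 'v1'), ('c2', 'i1', 'v1'),
            ...                          ('c1', 'i2', 'v1'), ('c2', 'i2', 'v2')])
            >>> t.Ao('c1', 'c2')
            0.5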
"""
|
||
|
data = self._grouped_data('item', (x for x in self.data if x['coder'] in (cA, cB)))
|
||
|
ret = sum(self.agr(cA, cB, item, item_data) for item, item_data in data) / len(self.I)
|
||
|
log.debug("Observed agreement between %s and %s: %f", cA, cB, ret)
|
||
|
return ret
|
||
|
|
||
|
    def _pairwise_average(self, function):
        """Calculates the average of function results for each distinct coder pair."""
        total = 0
        n = 0
        s = self.C.copy()
        for cA in self.C:
            s.remove(cA)  # consider each unordered pair exactly once
            for cB in s:
                total += function(cA, cB)
                n += 1
        ret = total / n
        return ret

    def avg_Ao(self):
        """Average observed agreement across all coders and items."""
        ret = self._pairwise_average(self.Ao)
        log.debug("Average observed agreement: %f", ret)
        return ret

    def Do_alpha(self):
        """The observed disagreement for the alpha coefficient.

        The alpha coefficient, unlike the other metrics, uses this rather than
        observed agreement.
        """
        total = 0.0
        for i, itemdata in self._grouped_data('item'):
            label_freqs = FreqDist(x['labels'] for x in itemdata)

            for j, nj in iteritems(label_freqs):
                for l, nl in iteritems(label_freqs):
                    total += float(nj * nl) * self.distance(l, j)
        ret = (1.0 / (len(self.I) * len(self.C) * (len(self.C) - 1))) * total
        log.debug("Observed disagreement: %f", ret)
        return ret

    def Do_Kw_pairwise(self, cA, cB, max_distance=1.0):
        """The observed disagreement for the weighted kappa coefficient."""
        total = 0.0
        data = (x for x in self.data if x['coder'] in (cA, cB))
        for i, itemdata in self._grouped_data('item', data):
            # we should have two labelings of item i; the distance doesn't
            # care which coder's labeling comes first
            total += self.distance(next(itemdata)['labels'],
                                   next(itemdata)['labels'])

        ret = total / (len(self.I) * max_distance)
        log.debug("Observed disagreement between %s and %s: %f", cA, cB, ret)
        return ret

    def Do_Kw(self, max_distance=1.0):
        """Observed disagreement, averaged over all coder pairs."""
        ret = self._pairwise_average(lambda cA, cB: self.Do_Kw_pairwise(cA, cB, max_distance))
        log.debug("Observed disagreement: %f", ret)
        return ret

    # Agreement Coefficients

    def S(self):
        """Bennett, Albert and Goldstein 1954
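
        Chance agreement is taken to be uniform over the label set, so
        S = (Ao - 1/|K|) / (1 - 1/|K|). A worked toy example (coders, items
        and labels are made up):

            >>> t = AnnotationTask(data=[('c1', 'i1', 'v1'), ('c2', 'i1', 'v1'),
            ...                          ('c1', 'i2', 'v2'), ('c2', 'i2', 'v2'),
            ...                          ('c1', 'i3', 'v1'), ('c2', 'i3', 'v2')])
            >>> round(t.S(), 2)
            0.33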
"""
|
||
|
Ae = 1.0 / len(self.K)
|
||
|
ret = (self.avg_Ao() - Ae) / (1.0 - Ae)
|
||
|
return ret
|
||
|
|
||
|
    def pi(self):
        """Scott 1955; here, multi-pi.
        Equivalent to K from Siegel and Castellan (1988).
        """
        total = 0.0
        label_freqs = FreqDist(x['labels'] for x in self.data)
        for k, f in iteritems(label_freqs):
            total += f ** 2
        # chance agreement: the sum over labels of the squared overall
        # proportion of annotations carrying that label
        Ae = total / ((len(self.I) * len(self.C)) ** 2)
        return (self.avg_Ao() - Ae) / (1 - Ae)

    def Ae_kappa(self, cA, cB):
        """Expected agreement between coders cA and cB for Cohen's kappa."""
        Ae = 0.0
        nitems = float(len(self.I))
        label_freqs = ConditionalFreqDist((x['labels'], x['coder']) for x in self.data)
        for k in label_freqs.conditions():
            Ae += (label_freqs[k][cA] / nitems) * (label_freqs[k][cB] / nitems)
        return Ae

    def kappa_pairwise(self, cA, cB):
        """Cohen's kappa for the pair of coders cA and cB."""
        Ae = self.Ae_kappa(cA, cB)
        ret = (self.Ao(cA, cB) - Ae) / (1.0 - Ae)
        log.debug("Expected agreement between %s and %s: %f", cA, cB, Ae)
        return ret

    def kappa(self):
        """Cohen 1960
        Averages naively over kappas for each coder pair.
        """
        return self._pairwise_average(self.kappa_pairwise)

    def multi_kappa(self):
        """Davies and Fleiss 1982
        Averages over observed and expected agreements for each coder pair.
        """
        Ae = self._pairwise_average(self.Ae_kappa)
        return (self.avg_Ao() - Ae) / (1.0 - Ae)

    def alpha(self):
        """Krippendorff 1980"""
        # check for degenerate cases
        if len(self.K) == 0:
            raise ValueError("Cannot calculate alpha, no data present!")
        if len(self.K) == 1:
            log.debug("Only one annotation value, alpha returning 1.")
            return 1
        if len(self.C) == 1 and len(self.I) == 1:
            raise ValueError("Cannot calculate alpha, only one coder and item present!")

        De = 0.0

        label_freqs = FreqDist(x['labels'] for x in self.data)
        for j in self.K:
            nj = label_freqs[j]
            for l in self.K:
                De += float(nj * label_freqs[l]) * self.distance(j, l)
        try:
            De = (1.0 / (len(self.I) * len(self.C) * (len(self.I) * len(self.C) - 1))) * De
            log.debug("Expected disagreement: %f", De)
            # alpha = 1 - Do/De
            ret = 1.0 - (self.Do_alpha() / De)
        except ZeroDivisionError:
            raise ValueError("Cannot calculate alpha, expected disagreement zero, check the distance function!")
        return ret

    def weighted_kappa_pairwise(self, cA, cB, max_distance=1.0):
        """Cohen 1968"""
        total = 0.0
        label_freqs = ConditionalFreqDist((x['coder'], x['labels'])
                                          for x in self.data
                                          if x['coder'] in (cA, cB))
        for j in self.K:
            for l in self.K:
                total += label_freqs[cA][j] * label_freqs[cB][l] * self.distance(j, l)
        De = total / (max_distance * pow(len(self.I), 2))
        log.debug("Expected disagreement between %s and %s: %f", cA, cB, De)
        # pass max_distance through so Do and De are normalized consistently
        Do = self.Do_Kw_pairwise(cA, cB, max_distance)
        ret = 1.0 - (Do / De)
        return ret

    def weighted_kappa(self, max_distance=1.0):
        """Cohen 1968"""
        return self._pairwise_average(lambda cA, cB: self.weighted_kappa_pairwise(cA, cB, max_distance))


if __name__ == '__main__':

    import optparse

    from nltk.metrics import distance

    # process command-line arguments
    parser = optparse.OptionParser()
    parser.add_option("-d", "--distance", dest="distance", default="binary_distance",
                      help="distance metric to use")
    parser.add_option("-a", "--agreement", dest="agreement", default="kappa",
                      help="agreement coefficient to calculate")
    parser.add_option("-e", "--exclude", dest="exclude", action="append", default=[],
                      help="coder names to exclude (may be specified multiple times)")
    parser.add_option("-i", "--include", dest="include", action="append", default=[],
                      help="coder names to include, same format as exclude")
    parser.add_option("-f", "--file", dest="file",
                      help="file to read labelings from, each line with three columns: 'labeler item labels'")
    parser.add_option("-v", "--verbose", dest="verbose", default='0',
                      help="how much debugging to print on stderr (0-4)")
    parser.add_option("-c", "--columnsep", dest="columnsep", default="\t",
                      help="char/string that separates the three columns in the file, defaults to tab")
    parser.add_option("-l", "--labelsep", dest="labelsep", default=",",
                      help="char/string that separates labels (if labelers can assign more than one), defaults to comma")
    parser.add_option("-p", "--presence", dest="presence", default=None,
                      help="convert each labeling into 1 or 0, based on presence of LABEL")
    parser.add_option("-T", "--thorough", dest="thorough", default=False, action="store_true",
                      help="calculate agreement for every subset of the annotators")
    (options, remainder) = parser.parse_args()
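
    # Example invocation (the file name here is hypothetical):
    #   python agreement.py -f labelings.tsv -a kappa -v 2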

    if not options.file:
        parser.print_help()
        exit()

    # map verbosity 0-4 onto logging levels CRITICAL (50) down to DEBUG (10)
    logging.basicConfig(level=50 - 10 * int(options.verbose))

    # read in data from the specified file
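    # Each line should hold three columns; a hypothetical example, with the
    # default tab column separator and comma label separator:
    #   coder1<TAB>item1<TAB>label1,label2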
    data = []
    with open(options.file, 'r') as infile:
        for l in infile:
            toks = l.split(options.columnsep)
            coder, object_, labels = (toks[0],
                                      str(toks[1:-1]),
                                      frozenset(toks[-1].strip().split(options.labelsep)))
            if ((options.include == options.exclude) or
                    (len(options.include) > 0 and coder in options.include) or
                    (len(options.exclude) > 0 and coder not in options.exclude)):
                data.append((coder, object_, labels))

    if options.presence:
        task = AnnotationTask(data, getattr(distance, options.distance)(options.presence))
    else:
        task = AnnotationTask(data, getattr(distance, options.distance))

    if options.thorough:
        # TODO: agreement over every subset of the annotators is not yet implemented
        pass
    else:
        print(getattr(task, options.agreement)())

    logging.shutdown()