415 lines
15 KiB
Python
415 lines
15 KiB
Python
|
# Natural Language Toolkit: Ngram Association Measures
|
||
|
#
|
||
|
# Copyright (C) 2001-2018 NLTK Project
|
||
|
# Author: Joel Nothman <jnothman@student.usyd.edu.au>
|
||
|
# URL: <http://nltk.org>
|
||
|
# For license information, see LICENSE.TXT
|
||
|
|
||
|
"""
|
||
|
Provides scoring functions for a number of association measures through a
|
||
|
generic, abstract implementation in ``NgramAssocMeasures``, and n-specific
|
||
|
``BigramAssocMeasures`` and ``TrigramAssocMeasures``.
|
||
|
"""
|
||
|
|
||
|
from __future__ import division
|
||
|
from abc import ABCMeta, abstractmethod
|
||
|
from six import add_metaclass
|
||
|
import math as _math
|
||
|
from functools import reduce
|
||
|
_log2 = lambda x: _math.log(x, 2.0)
|
||
|
_ln = _math.log
|
||
|
|
||
|
_product = lambda s: reduce(lambda x, y: x * y, s)
|
||
|
|
||
|
_SMALL = 1e-20
|
||
|
|
||
|
try:
|
||
|
from scipy.stats import fisher_exact
|
||
|
except ImportError:
|
||
|
def fisher_exact(*_args, **_kwargs):
|
||
|
raise NotImplementedError
|
||
|
|
||
|
### Indices to marginals arguments:
|
||
|
|
||
|
NGRAM = 0
|
||
|
"""Marginals index for the ngram count"""
|
||
|
|
||
|
UNIGRAMS = -2
|
||
|
"""Marginals index for a tuple of each unigram count"""
|
||
|
|
||
|
TOTAL = -1
|
||
|
"""Marginals index for the number of words in the data"""
|
||
|
|
||
|
|
||
|
@add_metaclass(ABCMeta)
|
||
|
class NgramAssocMeasures(object):
|
||
|
"""
|
||
|
An abstract class defining a collection of generic association measures.
|
||
|
Each public method returns a score, taking the following arguments::
|
||
|
|
||
|
score_fn(count_of_ngram,
|
||
|
(count_of_n-1gram_1, ..., count_of_n-1gram_j),
|
||
|
(count_of_n-2gram_1, ..., count_of_n-2gram_k),
|
||
|
...,
|
||
|
(count_of_1gram_1, ..., count_of_1gram_n),
|
||
|
count_of_total_words)
|
||
|
|
||
|
See ``BigramAssocMeasures`` and ``TrigramAssocMeasures``
|
||
|
|
||
|
Inheriting classes should define a property _n, and a method _contingency
|
||
|
which calculates contingency values from marginals in order for all
|
||
|
association measures defined here to be usable.
|
||
|
"""
|
||
|
|
||
|
_n = 0
|
||
|
|
||
|
@staticmethod
|
||
|
@abstractmethod
|
||
|
def _contingency(*marginals):
|
||
|
"""Calculates values of a contingency table from marginal values."""
|
||
|
raise NotImplementedError("The contingency table is not available"
|
||
|
"in the general ngram case")
|
||
|
|
||
|
@staticmethod
|
||
|
@abstractmethod
|
||
|
def _marginals(*contingency):
|
||
|
"""Calculates values of contingency table marginals from its values."""
|
||
|
raise NotImplementedError("The contingency table is not available"
|
||
|
"in the general ngram case")
|
||
|
|
||
|
@classmethod
|
||
|
def _expected_values(cls, cont):
|
||
|
"""Calculates expected values for a contingency table."""
|
||
|
n_all = sum(cont)
|
||
|
bits = [1 << i for i in range(cls._n)]
|
||
|
|
||
|
# For each contingency table cell
|
||
|
for i in range(len(cont)):
|
||
|
# Yield the expected value
|
||
|
yield (_product(sum(cont[x] for x in range(2 ** cls._n)
|
||
|
if (x & j) == (i & j))
|
||
|
for j in bits) /
|
||
|
(n_all ** (cls._n - 1)))
|
||
|
|
||
|
@staticmethod
|
||
|
def raw_freq(*marginals):
|
||
|
"""Scores ngrams by their frequency"""
|
||
|
return marginals[NGRAM] / marginals[TOTAL]
|
||
|
|
||
|
@classmethod
|
||
|
def student_t(cls, *marginals):
|
||
|
"""Scores ngrams using Student's t test with independence hypothesis
|
||
|
for unigrams, as in Manning and Schutze 5.3.1.
|
||
|
"""
|
||
|
return ((marginals[NGRAM] -
|
||
|
_product(marginals[UNIGRAMS]) /
|
||
|
(marginals[TOTAL] ** (cls._n - 1))) /
|
||
|
(marginals[NGRAM] + _SMALL) ** .5)
|
||
|
|
||
|
@classmethod
|
||
|
def chi_sq(cls, *marginals):
|
||
|
"""Scores ngrams using Pearson's chi-square as in Manning and Schutze
|
||
|
5.3.3.
|
||
|
"""
|
||
|
cont = cls._contingency(*marginals)
|
||
|
exps = cls._expected_values(cont)
|
||
|
return sum((obs - exp) ** 2 / (exp + _SMALL)
|
||
|
for obs, exp in zip(cont, exps))
|
||
|
|
||
|
@staticmethod
|
||
|
def mi_like(*marginals, **kwargs):
|
||
|
"""Scores ngrams using a variant of mutual information. The keyword
|
||
|
argument power sets an exponent (default 3) for the numerator. No
|
||
|
logarithm of the result is calculated.
|
||
|
"""
|
||
|
return (marginals[NGRAM] ** kwargs.get('power', 3) /
|
||
|
_product(marginals[UNIGRAMS]))
|
||
|
|
||
|
@classmethod
|
||
|
def pmi(cls, *marginals):
|
||
|
"""Scores ngrams by pointwise mutual information, as in Manning and
|
||
|
Schutze 5.4.
|
||
|
"""
|
||
|
return (_log2(marginals[NGRAM] * marginals[TOTAL] ** (cls._n - 1)) -
|
||
|
_log2(_product(marginals[UNIGRAMS])))
|
||
|
|
||
|
@classmethod
|
||
|
def likelihood_ratio(cls, *marginals):
|
||
|
"""Scores ngrams using likelihood ratios as in Manning and Schutze 5.3.4.
|
||
|
"""
|
||
|
cont = cls._contingency(*marginals)
|
||
|
return (cls._n *
|
||
|
sum(obs * _ln(obs / (exp + _SMALL) + _SMALL)
|
||
|
for obs, exp in zip(cont, cls._expected_values(cont))))
|
||
|
|
||
|
@classmethod
|
||
|
def poisson_stirling(cls, *marginals):
|
||
|
"""Scores ngrams using the Poisson-Stirling measure."""
|
||
|
exp = (_product(marginals[UNIGRAMS]) /
|
||
|
(marginals[TOTAL] ** (cls._n - 1)))
|
||
|
return marginals[NGRAM] * (_log2(marginals[NGRAM] / exp) - 1)
|
||
|
|
||
|
@classmethod
|
||
|
def jaccard(cls, *marginals):
|
||
|
"""Scores ngrams using the Jaccard index."""
|
||
|
cont = cls._contingency(*marginals)
|
||
|
return cont[0] / sum(cont[:-1])
|
||
|
|
||
|
|
||
|
class BigramAssocMeasures(NgramAssocMeasures):
|
||
|
"""
|
||
|
A collection of bigram association measures. Each association measure
|
||
|
is provided as a function with three arguments::
|
||
|
|
||
|
bigram_score_fn(n_ii, (n_ix, n_xi), n_xx)
|
||
|
|
||
|
The arguments constitute the marginals of a contingency table, counting
|
||
|
the occurrences of particular events in a corpus. The letter i in the
|
||
|
suffix refers to the appearance of the word in question, while x indicates
|
||
|
the appearance of any word. Thus, for example:
|
||
|
|
||
|
n_ii counts (w1, w2), i.e. the bigram being scored
|
||
|
n_ix counts (w1, *)
|
||
|
n_xi counts (*, w2)
|
||
|
n_xx counts (*, *), i.e. any bigram
|
||
|
|
||
|
This may be shown with respect to a contingency table::
|
||
|
|
||
|
w1 ~w1
|
||
|
------ ------
|
||
|
w2 | n_ii | n_oi | = n_xi
|
||
|
------ ------
|
||
|
~w2 | n_io | n_oo |
|
||
|
------ ------
|
||
|
= n_ix TOTAL = n_xx
|
||
|
"""
|
||
|
|
||
|
_n = 2
|
||
|
|
||
|
@staticmethod
|
||
|
def _contingency(n_ii, n_ix_xi_tuple, n_xx):
|
||
|
"""Calculates values of a bigram contingency table from marginal values."""
|
||
|
(n_ix, n_xi) = n_ix_xi_tuple
|
||
|
n_oi = n_xi - n_ii
|
||
|
n_io = n_ix - n_ii
|
||
|
return (n_ii, n_oi, n_io, n_xx - n_ii - n_oi - n_io)
|
||
|
|
||
|
@staticmethod
|
||
|
def _marginals(n_ii, n_oi, n_io, n_oo):
|
||
|
"""Calculates values of contingency table marginals from its values."""
|
||
|
return (n_ii, (n_oi + n_ii, n_io + n_ii), n_oo + n_oi + n_io + n_ii)
|
||
|
|
||
|
@staticmethod
|
||
|
def _expected_values(cont):
|
||
|
"""Calculates expected values for a contingency table."""
|
||
|
n_xx = sum(cont)
|
||
|
# For each contingency table cell
|
||
|
for i in range(4):
|
||
|
yield (cont[i] + cont[i ^ 1]) * (cont[i] + cont[i ^ 2]) / n_xx
|
||
|
|
||
|
@classmethod
|
||
|
def phi_sq(cls, *marginals):
|
||
|
"""Scores bigrams using phi-square, the square of the Pearson correlation
|
||
|
coefficient.
|
||
|
"""
|
||
|
n_ii, n_io, n_oi, n_oo = cls._contingency(*marginals)
|
||
|
|
||
|
return ((n_ii*n_oo - n_io*n_oi)**2 /
|
||
|
((n_ii + n_io) * (n_ii + n_oi) * (n_io + n_oo) * (n_oi + n_oo)))
|
||
|
|
||
|
@classmethod
|
||
|
def chi_sq(cls, n_ii, n_ix_xi_tuple, n_xx):
|
||
|
"""Scores bigrams using chi-square, i.e. phi-sq multiplied by the number
|
||
|
of bigrams, as in Manning and Schutze 5.3.3.
|
||
|
"""
|
||
|
(n_ix, n_xi) = n_ix_xi_tuple
|
||
|
return n_xx * cls.phi_sq(n_ii, (n_ix, n_xi), n_xx)
|
||
|
|
||
|
@classmethod
|
||
|
def fisher(cls, *marginals):
|
||
|
"""Scores bigrams using Fisher's Exact Test (Pedersen 1996). Less
|
||
|
sensitive to small counts than PMI or Chi Sq, but also more expensive
|
||
|
to compute. Requires scipy.
|
||
|
"""
|
||
|
|
||
|
n_ii, n_io, n_oi, n_oo = cls._contingency(*marginals)
|
||
|
|
||
|
(odds, pvalue) = fisher_exact([[n_ii, n_io], [n_oi, n_oo]], alternative='less')
|
||
|
return pvalue
|
||
|
|
||
|
@staticmethod
|
||
|
def dice(n_ii, n_ix_xi_tuple, n_xx):
|
||
|
"""Scores bigrams using Dice's coefficient."""
|
||
|
(n_ix, n_xi) = n_ix_xi_tuple
|
||
|
return 2 * n_ii / (n_ix + n_xi)
|
||
|
|
||
|
|
||
|
class TrigramAssocMeasures(NgramAssocMeasures):
|
||
|
"""
|
||
|
A collection of trigram association measures. Each association measure
|
||
|
is provided as a function with four arguments::
|
||
|
|
||
|
trigram_score_fn(n_iii,
|
||
|
(n_iix, n_ixi, n_xii),
|
||
|
(n_ixx, n_xix, n_xxi),
|
||
|
n_xxx)
|
||
|
|
||
|
The arguments constitute the marginals of a contingency table, counting
|
||
|
the occurrences of particular events in a corpus. The letter i in the
|
||
|
suffix refers to the appearance of the word in question, while x indicates
|
||
|
the appearance of any word. Thus, for example:
|
||
|
n_iii counts (w1, w2, w3), i.e. the trigram being scored
|
||
|
n_ixx counts (w1, *, *)
|
||
|
n_xxx counts (*, *, *), i.e. any trigram
|
||
|
"""
|
||
|
|
||
|
_n = 3
|
||
|
|
||
|
@staticmethod
|
||
|
def _contingency(n_iii, n_iix_tuple, n_ixx_tuple, n_xxx):
|
||
|
"""Calculates values of a trigram contingency table (or cube) from
|
||
|
marginal values.
|
||
|
>>> TrigramAssocMeasures._contingency(1, (1, 1, 1), (1, 73, 1), 2000)
|
||
|
(1, 0, 0, 0, 0, 72, 0, 1927)
|
||
|
"""
|
||
|
(n_iix, n_ixi, n_xii) = n_iix_tuple
|
||
|
(n_ixx, n_xix, n_xxi) = n_ixx_tuple
|
||
|
n_oii = n_xii - n_iii
|
||
|
n_ioi = n_ixi - n_iii
|
||
|
n_iio = n_iix - n_iii
|
||
|
n_ooi = n_xxi - n_iii - n_oii - n_ioi
|
||
|
n_oio = n_xix - n_iii - n_oii - n_iio
|
||
|
n_ioo = n_ixx - n_iii - n_ioi - n_iio
|
||
|
n_ooo = n_xxx - n_iii - n_oii - n_ioi - n_iio - n_ooi - n_oio - n_ioo
|
||
|
|
||
|
return (n_iii, n_oii, n_ioi, n_ooi,
|
||
|
n_iio, n_oio, n_ioo, n_ooo)
|
||
|
|
||
|
@staticmethod
|
||
|
def _marginals(*contingency):
|
||
|
"""Calculates values of contingency table marginals from its values.
|
||
|
>>> TrigramAssocMeasures._marginals(1, 0, 0, 0, 0, 72, 0, 1927)
|
||
|
(1, (1, 1, 1), (1, 73, 1), 2000)
|
||
|
"""
|
||
|
n_iii, n_oii, n_ioi, n_ooi, n_iio, n_oio, n_ioo, n_ooo = contingency
|
||
|
return (n_iii,
|
||
|
(n_iii + n_iio, n_iii + n_ioi, n_iii + n_oii),
|
||
|
(n_iii + n_ioi + n_iio + n_ioo,
|
||
|
n_iii + n_oii + n_iio + n_oio,
|
||
|
n_iii + n_oii + n_ioi + n_ooi),
|
||
|
sum(contingency))
|
||
|
|
||
|
|
||
|
class QuadgramAssocMeasures(NgramAssocMeasures):
|
||
|
"""
|
||
|
A collection of quadgram association measures. Each association measure
|
||
|
is provided as a function with five arguments::
|
||
|
|
||
|
trigram_score_fn(n_iiii,
|
||
|
(n_iiix, n_iixi, n_ixii, n_xiii),
|
||
|
(n_iixx, n_ixix, n_ixxi, n_xixi, n_xxii, n_xiix),
|
||
|
(n_ixxx, n_xixx, n_xxix, n_xxxi),
|
||
|
n_all)
|
||
|
|
||
|
The arguments constitute the marginals of a contingency table, counting
|
||
|
the occurrences of particular events in a corpus. The letter i in the
|
||
|
suffix refers to the appearance of the word in question, while x indicates
|
||
|
the appearance of any word. Thus, for example:
|
||
|
n_iiii counts (w1, w2, w3, w4), i.e. the quadgram being scored
|
||
|
n_ixxi counts (w1, *, *, w4)
|
||
|
n_xxxx counts (*, *, *, *), i.e. any quadgram
|
||
|
"""
|
||
|
|
||
|
_n = 4
|
||
|
|
||
|
@staticmethod
|
||
|
def _contingency(n_iiii, n_iiix_tuple, n_iixx_tuple, n_ixxx_tuple, n_xxxx):
|
||
|
"""Calculates values of a quadgram contingency table from
|
||
|
marginal values.
|
||
|
"""
|
||
|
(n_iiix, n_iixi, n_ixii, n_xiii) = n_iiix_tuple
|
||
|
(n_iixx, n_ixix, n_ixxi, n_xixi, n_xxii, n_xiix) = n_iixx_tuple
|
||
|
(n_ixxx, n_xixx, n_xxix, n_xxxi) = n_ixxx_tuple
|
||
|
n_oiii = n_xiii - n_iiii
|
||
|
n_ioii = n_ixii - n_iiii
|
||
|
n_iioi = n_iixi - n_iiii
|
||
|
n_ooii = n_xxii - n_iiii - n_oiii - n_ioii
|
||
|
n_oioi = n_xixi - n_iiii - n_oiii - n_iioi
|
||
|
n_iooi = n_ixxi - n_iiii - n_ioii - n_iioi
|
||
|
n_oooi = n_xxxi - n_iiii - n_oiii - n_ioii - n_iioi - n_ooii - n_iooi - n_oioi
|
||
|
n_iiio = n_iiix - n_iiii
|
||
|
n_oiio = n_xiix - n_iiii - n_oiii - n_iiio
|
||
|
n_ioio = n_ixix - n_iiii - n_ioii - n_iiio
|
||
|
n_ooio = n_xxix - n_iiii - n_oiii - n_ioii - n_iiio - n_ooii - n_ioio - n_oiio
|
||
|
n_iioo = n_iixx - n_iiii - n_iioi - n_iiio
|
||
|
n_oioo = n_xixx - n_iiii - n_oiii - n_iioi - n_iiio - n_oioi - n_oiio - n_iioo
|
||
|
n_iooo = n_ixxx - n_iiii - n_ioii - n_iioi - n_iiio - n_iooi - n_iioo - n_ioio
|
||
|
n_oooo = n_xxxx - n_iiii - n_oiii - n_ioii - n_iioi - n_ooii - n_oioi - n_iooi - \
|
||
|
n_oooi - n_iiio - n_oiio - n_ioio - n_ooio - n_iioo - n_oioo - n_iooo
|
||
|
|
||
|
return (n_iiii, n_oiii, n_ioii, n_ooii, n_iioi,
|
||
|
n_oioi, n_iooi, n_oooi, n_iiio, n_oiio,
|
||
|
n_ioio, n_ooio, n_iioo, n_oioo, n_iooo, n_oooo)
|
||
|
|
||
|
@staticmethod
|
||
|
def _marginals(*contingency):
|
||
|
"""Calculates values of contingency table marginals from its values.
|
||
|
QuadgramAssocMeasures._marginals(1, 0, 2, 46, 552, 825, 2577, 34967, 1, 0, 2, 48, 7250, 9031, 28585, 356653)
|
||
|
(1, (2, 553, 3, 1), (7804, 6, 3132, 1378, 49, 2), (38970, 17660, 100, 38970), 440540)
|
||
|
"""
|
||
|
n_iiii, n_oiii, n_ioii, n_ooii, n_iioi, n_oioi, n_iooi, n_oooi, n_iiio, n_oiio, n_ioio, n_ooio, \
|
||
|
n_iioo, n_oioo, n_iooo, n_oooo = contingency
|
||
|
|
||
|
n_iiix = n_iiii + n_iiio
|
||
|
n_iixi = n_iiii + n_iioi
|
||
|
n_ixii = n_iiii + n_ioii
|
||
|
n_xiii = n_iiii + n_oiii
|
||
|
|
||
|
n_iixx = n_iiii + n_iioi + n_iiio + n_iioo
|
||
|
n_ixix = n_iiii + n_ioii + n_iiio + n_ioio
|
||
|
n_ixxi = n_iiii + n_ioii + n_iioi + n_iooi
|
||
|
n_xixi = n_iiii + n_oiii + n_iioi + n_oioi
|
||
|
n_xxii = n_iiii + n_oiii + n_ioii + n_ooii
|
||
|
n_xiix = n_iiii + n_oiii + n_iiio + n_oiio
|
||
|
|
||
|
n_ixxx = n_iiii + n_ioii + n_iioi + n_iiio + n_iooi + n_iioo + n_ioio + n_iooo
|
||
|
n_xixx = n_iiii + n_oiii + n_iioi + n_iiio + n_oioi + n_oiio + n_iioo + n_oioo
|
||
|
n_xxix = n_iiii + n_oiii + n_ioii + n_iiio + n_ooii + n_ioio + n_oiio + n_ooio
|
||
|
n_xxxi = n_iiii + n_oiii + n_ioii + n_iioi + n_ooii + n_iooi + n_oioi + n_oooi
|
||
|
|
||
|
n_all = sum(contingency)
|
||
|
|
||
|
return (n_iiii,
|
||
|
(n_iiix, n_iixi, n_ixii, n_xiii),
|
||
|
(n_iixx, n_ixix, n_ixxi, n_xixi, n_xxii, n_xiix),
|
||
|
(n_ixxx, n_xixx, n_xxix, n_xxxi),
|
||
|
n_all)
|
||
|
|
||
|
|
||
|
class ContingencyMeasures(object):
|
||
|
"""Wraps NgramAssocMeasures classes such that the arguments of association
|
||
|
measures are contingency table values rather than marginals.
|
||
|
"""
|
||
|
|
||
|
def __init__(self, measures):
|
||
|
"""Constructs a ContingencyMeasures given a NgramAssocMeasures class"""
|
||
|
self.__class__.__name__ = 'Contingency' + measures.__class__.__name__
|
||
|
for k in dir(measures):
|
||
|
if k.startswith('__'):
|
||
|
continue
|
||
|
v = getattr(measures, k)
|
||
|
if not k.startswith('_'):
|
||
|
v = self._make_contingency_fn(measures, v)
|
||
|
setattr(self, k, v)
|
||
|
|
||
|
@staticmethod
|
||
|
def _make_contingency_fn(measures, old_fn):
|
||
|
"""From an association measure function, produces a new function which
|
||
|
accepts contingency table values as its arguments.
|
||
|
"""
|
||
|
def res(*contingency):
|
||
|
return old_fn(*measures._marginals(*contingency))
|
||
|
res.__doc__ = old_fn.__doc__
|
||
|
res.__name__ = old_fn.__name__
|
||
|
return res
|