276 lines
11 KiB
Text
276 lines
11 KiB
Text
.. Copyright (C) 2001-2018 NLTK Project
|
|
.. For license information, see LICENSE.TXT
|
|
|
|
==============
|
|
Collocations
|
|
==============
|
|
|
|
Overview
|
|
~~~~~~~~
|
|
|
|
Collocations are expressions of multiple words which commonly co-occur. For
|
|
example, the top ten bigram collocations in Genesis are listed below, as
|
|
measured using Pointwise Mutual Information.
|
|
|
|
>>> import nltk
|
|
>>> from nltk.collocations import *
|
|
>>> bigram_measures = nltk.collocations.BigramAssocMeasures()
|
|
>>> trigram_measures = nltk.collocations.TrigramAssocMeasures()
|
|
>>> finder = BigramCollocationFinder.from_words(
|
|
... nltk.corpus.genesis.words('english-web.txt'))
|
|
>>> finder.nbest(bigram_measures.pmi, 10) # doctest: +NORMALIZE_WHITESPACE
|
|
[(u'Allon', u'Bacuth'), (u'Ashteroth', u'Karnaim'), (u'Ben', u'Ammi'),
|
|
(u'En', u'Mishpat'), (u'Jegar', u'Sahadutha'), (u'Salt', u'Sea'),
|
|
(u'Whoever', u'sheds'), (u'appoint', u'overseers'), (u'aromatic', u'resin'),
|
|
(u'cutting', u'instrument')]
|
|
|
|
While these words are highly collocated, the expressions are also very
|
|
infrequent. Therefore it is useful to apply filters, such as ignoring all
|
|
bigrams which occur fewer than three times in the corpus:
|
|
|
|
>>> finder.apply_freq_filter(3)
|
|
>>> finder.nbest(bigram_measures.pmi, 10) # doctest: +NORMALIZE_WHITESPACE
|
|
[(u'Beer', u'Lahai'), (u'Lahai', u'Roi'), (u'gray', u'hairs'),
|
|
(u'Most', u'High'), (u'ewe', u'lambs'), (u'many', u'colors'),
|
|
(u'burnt', u'offering'), (u'Paddan', u'Aram'), (u'east', u'wind'),
|
|
(u'living', u'creature')]
|
|
|
|
We may similarly find collocations among tagged words:
|
|
|
|
>>> finder = BigramCollocationFinder.from_words(
|
|
... nltk.corpus.brown.tagged_words('ca01', tagset='universal'))
|
|
>>> finder.nbest(bigram_measures.pmi, 5) # doctest: +NORMALIZE_WHITESPACE
|
|
[(('1,119', 'NUM'), ('votes', 'NOUN')),
|
|
(('1962', 'NUM'), ("governor's", 'NOUN')),
|
|
(('637', 'NUM'), ('E.', 'NOUN')),
|
|
(('Alpharetta', 'NOUN'), ('prison', 'NOUN')),
|
|
(('Bar', 'NOUN'), ('Association', 'NOUN'))]
|
|
|
|
Or tags alone:
|
|
|
|
>>> finder = BigramCollocationFinder.from_words(t for w, t in
|
|
... nltk.corpus.brown.tagged_words('ca01', tagset='universal'))
|
|
>>> finder.nbest(bigram_measures.pmi, 10) # doctest: +NORMALIZE_WHITESPACE
|
|
[('PRT', 'VERB'), ('PRON', 'VERB'), ('ADP', 'DET'), ('.', 'PRON'), ('DET', 'ADJ'),
|
|
('CONJ', 'PRON'), ('ADP', 'NUM'), ('NUM', '.'), ('ADV', 'ADV'), ('VERB', 'ADV')]
|
|
|
|
Or spanning intervening words:
|
|
|
|
>>> finder = BigramCollocationFinder.from_words(
|
|
... nltk.corpus.genesis.words('english-web.txt'),
|
|
... window_size = 20)
|
|
>>> finder.apply_freq_filter(2)
|
|
>>> ignored_words = nltk.corpus.stopwords.words('english')
|
|
>>> finder.apply_word_filter(lambda w: len(w) < 3 or w.lower() in ignored_words)
|
|
>>> finder.nbest(bigram_measures.likelihood_ratio, 10) # doctest: +NORMALIZE_WHITESPACE
|
|
[(u'chief', u'chief'), (u'became', u'father'), (u'years', u'became'),
|
|
(u'hundred', u'years'), (u'lived', u'became'), (u'king', u'king'),
|
|
(u'lived', u'years'), (u'became', u'became'), (u'chief', u'chiefs'),
|
|
(u'hundred', u'became')]
|
|
|
|
Finders
|
|
~~~~~~~
|
|
|
|
The collocations package provides collocation finders which by default
|
|
consider all ngrams in a text as candidate collocations:
|
|
|
|
>>> text = "I do not like green eggs and ham, I do not like them Sam I am!"
|
|
>>> tokens = nltk.wordpunct_tokenize(text)
|
|
>>> finder = BigramCollocationFinder.from_words(tokens)
|
|
>>> scored = finder.score_ngrams(bigram_measures.raw_freq)
|
|
>>> sorted(bigram for bigram, score in scored) # doctest: +NORMALIZE_WHITESPACE
|
|
[(',', 'I'), ('I', 'am'), ('I', 'do'), ('Sam', 'I'), ('am', '!'),
|
|
('and', 'ham'), ('do', 'not'), ('eggs', 'and'), ('green', 'eggs'),
|
|
('ham', ','), ('like', 'green'), ('like', 'them'), ('not', 'like'),
|
|
('them', 'Sam')]
|
|
|
|
We could otherwise construct the collocation finder from manually-derived
|
|
FreqDists:
|
|
|
|
>>> word_fd = nltk.FreqDist(tokens)
|
|
>>> bigram_fd = nltk.FreqDist(nltk.bigrams(tokens))
|
|
>>> finder = BigramCollocationFinder(word_fd, bigram_fd)
|
|
>>> scored == finder.score_ngrams(bigram_measures.raw_freq)
|
|
True
|
|
|
|
A similar interface is provided for trigrams:
|
|
|
|
>>> finder = TrigramCollocationFinder.from_words(tokens)
|
|
>>> scored = finder.score_ngrams(trigram_measures.raw_freq)
|
|
>>> set(trigram for trigram, score in scored) == set(nltk.trigrams(tokens))
|
|
True
|
|
|
|
We may want to select only the top n results:
|
|
|
|
>>> sorted(finder.nbest(trigram_measures.raw_freq, 2))
|
|
[('I', 'do', 'not'), ('do', 'not', 'like')]
|
|
|
|
Alternatively, we can select those above a minimum score value:
|
|
|
|
>>> sorted(finder.above_score(trigram_measures.raw_freq,
|
|
... 1.0 / len(tuple(nltk.trigrams(tokens)))))
|
|
[('I', 'do', 'not'), ('do', 'not', 'like')]
|
|
|
|
Now spanning intervening words:
|
|
|
|
>>> finder = TrigramCollocationFinder.from_words(tokens)
|
|
>>> finder = TrigramCollocationFinder.from_words(tokens, window_size=4)
|
|
>>> sorted(finder.nbest(trigram_measures.raw_freq, 4))
|
|
[('I', 'do', 'like'), ('I', 'do', 'not'), ('I', 'not', 'like'), ('do', 'not', 'like')]
|
|
|
|
A closer look at the finder's ngram frequencies:
|
|
|
|
>>> sorted(finder.ngram_fd.items(), key=lambda t: (-t[1], t[0]))[:10] # doctest: +NORMALIZE_WHITESPACE
|
|
[(('I', 'do', 'like'), 2), (('I', 'do', 'not'), 2), (('I', 'not', 'like'), 2),
|
|
(('do', 'not', 'like'), 2), ((',', 'I', 'do'), 1), ((',', 'I', 'not'), 1),
|
|
((',', 'do', 'not'), 1), (('I', 'am', '!'), 1), (('Sam', 'I', '!'), 1),
|
|
(('Sam', 'I', 'am'), 1)]
|
|
|
|
|
|
Filtering candidates
|
|
~~~~~~~~~~~~~~~~~~~~
|
|
|
|
All the ngrams in a text are often too many to be useful when finding
|
|
collocations. It is generally useful to remove some words or punctuation,
|
|
and to require a minimum frequency for candidate collocations.
|
|
|
|
Given our sample text above, if we remove all trigrams containing personal
|
|
pronouns from candidature, score_ngrams should return 6 fewer results, and
|
|
'do not like' will be the only candidate which occurs more than once:
|
|
|
|
>>> finder = TrigramCollocationFinder.from_words(tokens)
|
|
>>> len(finder.score_ngrams(trigram_measures.raw_freq))
|
|
14
|
|
>>> finder.apply_word_filter(lambda w: w in ('I', 'me'))
|
|
>>> len(finder.score_ngrams(trigram_measures.raw_freq))
|
|
8
|
|
>>> sorted(finder.above_score(trigram_measures.raw_freq,
|
|
... 1.0 / len(tuple(nltk.trigrams(tokens)))))
|
|
[('do', 'not', 'like')]
|
|
|
|
Sometimes a filter is a function on the whole ngram, rather than each word,
|
|
for example when we want to permit 'and' in the middle of a trigram, but
|
|
not on either edge:
|
|
|
|
>>> finder.apply_ngram_filter(lambda w1, w2, w3: 'and' in (w1, w3))
|
|
>>> len(finder.score_ngrams(trigram_measures.raw_freq))
|
|
6
|
|
|
|
Finally, it is often important to remove low frequency candidates, as we
|
|
lack sufficient evidence about their significance as collocations:
|
|
|
|
>>> finder.apply_freq_filter(2)
|
|
>>> len(finder.score_ngrams(trigram_measures.raw_freq))
|
|
1
|
|
|
|
Association measures
|
|
~~~~~~~~~~~~~~~~~~~~
|
|
|
|
A number of measures are available to score collocations or other associations.
|
|
The arguments to measure functions are marginals of a contingency table, in the
|
|
bigram case (n_ii, (n_ix, n_xi), n_xx)::
|
|
|
|
            w1    ~w1
         ------ ------
     w2 | n_ii | n_oi | = n_xi
         ------ ------
    ~w2 | n_io | n_oo |
         ------ ------
         = n_ix        TOTAL = n_xx
|
|
|
|
We test their calculation using some known values presented in Manning and
|
|
Schutze's text and other papers.
|
|
|
|
Student's t: examples from Manning and Schutze 5.3.2
|
|
|
|
>>> print('%0.4f' % bigram_measures.student_t(8, (15828, 4675), 14307668))
|
|
0.9999
|
|
>>> print('%0.4f' % bigram_measures.student_t(20, (42, 20), 14307668))
|
|
4.4721
|
|
|
|
Chi-square: examples from Manning and Schutze 5.3.3
|
|
|
|
>>> print('%0.2f' % bigram_measures.chi_sq(8, (15828, 4675), 14307668))
|
|
1.55
|
|
>>> print('%0.0f' % bigram_measures.chi_sq(59, (67, 65), 571007))
|
|
456400
|
|
|
|
Likelihood ratios: examples from Dunning, CL, 1993
|
|
|
|
>>> print('%0.2f' % bigram_measures.likelihood_ratio(110, (2552, 221), 31777))
|
|
270.72
|
|
>>> print('%0.2f' % bigram_measures.likelihood_ratio(8, (13, 32), 31777))
|
|
95.29
|
|
|
|
Pointwise Mutual Information: examples from Manning and Schutze 5.4
|
|
|
|
>>> print('%0.2f' % bigram_measures.pmi(20, (42, 20), 14307668))
|
|
18.38
|
|
>>> print('%0.2f' % bigram_measures.pmi(20, (15019, 15629), 14307668))
|
|
0.29
|
|
|
|
TODO: Find authoritative results for trigrams.
|
|
|
|
Using contingency table values
|
|
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
|
|
|
While frequency counts make marginals readily available for collocation
|
|
finding, it is common to find published contingency table values. The
|
|
collocations package therefore provides a wrapper, ContingencyMeasures, which
|
|
wraps an association measures class, providing association measures which
|
|
take contingency values as arguments, (n_ii, n_io, n_oi, n_oo) in the
|
|
bigram case.
|
|
|
|
>>> from nltk.metrics import ContingencyMeasures
|
|
>>> cont_bigram_measures = ContingencyMeasures(bigram_measures)
|
|
>>> print('%0.2f' % cont_bigram_measures.likelihood_ratio(8, 5, 24, 31740))
|
|
95.29
|
|
>>> print('%0.2f' % cont_bigram_measures.chi_sq(8, 15820, 4667, 14287173))
|
|
1.55
|
|
|
|
Ranking and correlation
|
|
~~~~~~~~~~~~~~~~~~~~~~~
|
|
|
|
It is useful to consider the results of finding collocations as a ranking, and
|
|
the rankings output using different association measures can be compared using
|
|
the Spearman correlation coefficient.
|
|
|
|
Ranks can be assigned to a sorted list of results trivially by assigning
|
|
strictly increasing ranks to each result:
|
|
|
|
>>> from nltk.metrics.spearman import *
|
|
>>> results_list = ['item1', 'item2', 'item3', 'item4', 'item5']
|
|
>>> print(list(ranks_from_sequence(results_list)))
|
|
[('item1', 0), ('item2', 1), ('item3', 2), ('item4', 3), ('item5', 4)]
|
|
|
|
If scores are available for each result, we may allow sufficiently similar
|
|
results (differing by no more than rank_gap) to be assigned the same rank:
|
|
|
|
>>> results_scored = [('item1', 50.0), ('item2', 40.0), ('item3', 38.0),
|
|
... ('item4', 35.0), ('item5', 14.0)]
|
|
>>> print(list(ranks_from_scores(results_scored, rank_gap=5)))
|
|
[('item1', 0), ('item2', 1), ('item3', 1), ('item4', 1), ('item5', 4)]
|
|
|
|
The Spearman correlation coefficient gives a number from -1.0 to 1.0 comparing
|
|
two rankings. A coefficient of 1.0 indicates identical rankings; -1.0 indicates
|
|
exact opposite rankings.
|
|
|
|
>>> print('%0.1f' % spearman_correlation(
|
|
... ranks_from_sequence(results_list),
|
|
... ranks_from_sequence(results_list)))
|
|
1.0
|
|
>>> print('%0.1f' % spearman_correlation(
|
|
... ranks_from_sequence(reversed(results_list)),
|
|
... ranks_from_sequence(results_list)))
|
|
-1.0
|
|
>>> results_list2 = ['item2', 'item3', 'item1', 'item5', 'item4']
|
|
>>> print('%0.1f' % spearman_correlation(
|
|
... ranks_from_sequence(results_list),
|
|
... ranks_from_sequence(results_list2)))
|
|
0.6
|
|
>>> print('%0.1f' % spearman_correlation(
|
|
... ranks_from_sequence(reversed(results_list)),
|
|
... ranks_from_sequence(results_list2)))
|
|
-0.6
|
|
|
|
|