# Natural Language Toolkit: Collocations and Association Measures
#
# Copyright (C) 2001-2019 NLTK Project
# Author: Joel Nothman <jnothman@student.usyd.edu.au>
# URL: <http://nltk.org>
# For license information, see LICENSE.TXT
#
"""
Tools to identify collocations --- words that often appear consecutively
--- within corpora. They may also be used to find other associations between
word occurrences.

See Manning and Schutze ch. 5 at http://nlp.stanford.edu/fsnlp/promo/colloc.pdf
and the Text::NSP Perl package at http://ngram.sourceforge.net

Finding collocations requires first calculating the frequencies of words and
their appearance in the context of other words. Often the collection of words
will then require filtering to only retain useful content terms. Each ngram
of words may then be scored according to some association measure, in order
to determine the relative likelihood of each ngram being a collocation.

The ``BigramCollocationFinder`` and ``TrigramCollocationFinder`` classes provide
these functionalities, dependent on being provided a function which scores an
ngram given appropriate frequency counts. A number of standard association
measures are provided in bigram_measures and trigram_measures.
"""
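
# A minimal usage sketch (not part of the original module; assumes a tokenized
# corpus is already available as a list of strings called ``tokens``):
#
#     finder = BigramCollocationFinder.from_words(tokens)
#     finder.apply_freq_filter(3)                       # drop rare candidates
#     best = finder.nbest(BigramAssocMeasures.pmi, 10)  # ten highest-PMI bigrams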
from __future__ import print_function

# Possible TODOs:
# - consider the distinction between f(x,_) and f(x) and whether our
#   approximation is good enough for fragmented data, and mention it
# - add a n-gram collocation finder with measures which only utilise n-gram
#   and unigram counts (raw_freq, pmi, student_t)

import itertools as _itertools

from six import iteritems

from nltk.probability import FreqDist
from nltk.util import ngrams

# these two unused imports are referenced in collocations.doctest
from nltk.metrics import ContingencyMeasures, BigramAssocMeasures, TrigramAssocMeasures
from nltk.metrics.spearman import ranks_from_scores, spearman_correlation


class AbstractCollocationFinder(object):
    """
    An abstract base class for collocation finders whose purpose is to
    collect collocation candidate frequencies, filter and rank them.

    As a minimum, collocation finders require the frequencies of each
    word in a corpus, and the joint frequency of word tuples. This data
    should be provided through nltk.probability.FreqDist objects or an
    identical interface.
    """

    def __init__(self, word_fd, ngram_fd):
        self.word_fd = word_fd
        self.N = word_fd.N()
        self.ngram_fd = ngram_fd

    @classmethod
    def _build_new_documents(
        cls, documents, window_size, pad_left=False, pad_right=False, pad_symbol=None
    ):
        '''
        Pad each document with the placeholder symbol according to the window_size
        '''
        padding = (pad_symbol,) * (window_size - 1)
        if pad_right:
            return _itertools.chain.from_iterable(
                _itertools.chain(doc, padding) for doc in documents
            )
        if pad_left:
            return _itertools.chain.from_iterable(
                _itertools.chain(padding, doc) for doc in documents
            )
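
    # For example (illustrative, not from the original source): with
    # window_size=3 and pad_right=True, the documents [['a', 'b'], ['c']]
    # are flattened to the stream a, b, None, None, c, None, None, so that
    # counting windows never straddle a document boundary.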

    @classmethod
    def from_documents(cls, documents):
        """Constructs a collocation finder given a collection of documents,
        each of which is a list (or iterable) of tokens.
        """
        # return cls.from_words(_itertools.chain(*documents))
        return cls.from_words(
            cls._build_new_documents(documents, cls.default_ws, pad_right=True)
        )
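
    # A hypothetical call, for illustration only:
    #     TrigramCollocationFinder.from_documents([['the', 'quick', 'fox'],
    #                                              ['the', 'lazy', 'dog']])
    # pads each document on the right with ``default_ws - 1`` placeholders
    # before handing the flattened token stream to from_words().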

    @staticmethod
    def _ngram_freqdist(words, n):
        # Only positions that yield full n-length tuples are counted.
        return FreqDist(tuple(words[i : i + n]) for i in range(len(words) - n + 1))

    def _apply_filter(self, fn=lambda ngram, freq: False):
        """Generic filter removes ngrams from the frequency distribution
        if the function returns True when passed an ngram tuple.
        """
        tmp_ngram = FreqDist()
        for ngram, freq in iteritems(self.ngram_fd):
            if not fn(ngram, freq):
                tmp_ngram[ngram] = freq
        self.ngram_fd = tmp_ngram

    def apply_freq_filter(self, min_freq):
        """Removes candidate ngrams which have frequency less than min_freq."""
        self._apply_filter(lambda ng, freq: freq < min_freq)

    def apply_ngram_filter(self, fn):
        """Removes candidate ngrams (w1, w2, ...) where fn(w1, w2, ...)
        evaluates to True.
        """
        self._apply_filter(lambda ng, f: fn(*ng))

    def apply_word_filter(self, fn):
        """Removes candidate ngrams (w1, w2, ...) where any of (fn(w1), fn(w2),
        ...) evaluates to True.
        """
        self._apply_filter(lambda ng, f: any(fn(w) for w in ng))
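
    # For example (mirroring demo() below), given a finder ``cf`` and a
    # stopword list ``ignored_words``, one might write:
    #     cf.apply_freq_filter(3)
    #     cf.apply_word_filter(lambda w: len(w) < 3 or w.lower() in ignored_words)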

    def _score_ngrams(self, score_fn):
        """Generates (ngram, score) pairs as determined by the scoring
        function provided.
        """
        for tup in self.ngram_fd:
            score = self.score_ngram(score_fn, *tup)
            if score is not None:
                yield tup, score

    def score_ngrams(self, score_fn):
        """Returns a sequence of (ngram, score) pairs ordered from highest to
        lowest score, as determined by the scoring function provided.
        """
        return sorted(self._score_ngrams(score_fn), key=lambda t: (-t[1], t[0]))

    def nbest(self, score_fn, n):
        """Returns the top n ngrams when scored by the given function."""
        return [p for p, s in self.score_ngrams(score_fn)[:n]]

    def above_score(self, score_fn, min_score):
        """Returns a sequence of ngrams, ordered by decreasing score, whose
        scores each exceed the given minimum score.
        """
        for ngram, score in self.score_ngrams(score_fn):
            if score > min_score:
                yield ngram
            else:
                break


class BigramCollocationFinder(AbstractCollocationFinder):
    """A tool for the finding and ranking of bigram collocations or other
    association measures. It is often useful to use from_words() rather than
    constructing an instance directly.
    """

    default_ws = 2

    def __init__(self, word_fd, bigram_fd, window_size=2):
        """Construct a BigramCollocationFinder, given FreqDists for
        appearances of words and (possibly non-contiguous) bigrams.
        """
        AbstractCollocationFinder.__init__(self, word_fd, bigram_fd)
        self.window_size = window_size

    @classmethod
    def from_words(cls, words, window_size=2):
        """Construct a BigramCollocationFinder for all bigrams in the given
        sequence. When window_size > 2, count non-contiguous bigrams, in the
        style of Church and Hanks's (1990) association ratio.
        """
        wfd = FreqDist()
        bfd = FreqDist()

        if window_size < 2:
            raise ValueError("Specify window_size at least 2")

        for window in ngrams(words, window_size, pad_right=True):
            w1 = window[0]
            if w1 is None:
                continue
            wfd[w1] += 1
            for w2 in window[1:]:
                if w2 is not None:
                    bfd[(w1, w2)] += 1
        return cls(wfd, bfd, window_size=window_size)
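
    # Illustrative only: given a token list ``tokens``,
    #     BigramCollocationFinder.from_words(tokens, window_size=5)
    # counts a bigram for every pair (w1, w2) in which w2 occurs within the
    # four tokens following w1.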

    def score_ngram(self, score_fn, w1, w2):
        """Returns the score for a given bigram using the given scoring
        function. Following Church and Hanks (1990), counts are scaled by
        a factor of 1/(window_size - 1).
        """
        n_all = self.N
        n_ii = self.ngram_fd[(w1, w2)] / (self.window_size - 1.0)
        if not n_ii:
            return
        n_ix = self.word_fd[w1]
        n_xi = self.word_fd[w2]
        return score_fn(n_ii, (n_ix, n_xi), n_all)
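
    # Contingency naming used throughout this module: an 'i' marks a position
    # held by the word of interest and an 'x' marks any word. Here n_ii is the
    # (scaled) joint count of (w1, w2), n_ix and n_xi approximate the marginal
    # counts of w1 and w2, and n_all is the total number of words. score_fn is
    # expected to accept these arguments, as the functions in
    # nltk.metrics.BigramAssocMeasures do, e.g. BigramAssocMeasures.pmi.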


class TrigramCollocationFinder(AbstractCollocationFinder):
    """A tool for the finding and ranking of trigram collocations or other
    association measures. It is often useful to use from_words() rather than
    constructing an instance directly.
    """

    default_ws = 3

    def __init__(self, word_fd, bigram_fd, wildcard_fd, trigram_fd):
        """Construct a TrigramCollocationFinder, given FreqDists for
        appearances of words, bigrams, two words with any word between them,
        and trigrams.
        """
        AbstractCollocationFinder.__init__(self, word_fd, trigram_fd)
        self.wildcard_fd = wildcard_fd
        self.bigram_fd = bigram_fd

    @classmethod
    def from_words(cls, words, window_size=3):
        """Construct a TrigramCollocationFinder for all trigrams in the given
        sequence.
        """
        if window_size < 3:
            raise ValueError("Specify window_size at least 3")

        wfd = FreqDist()
        wildfd = FreqDist()
        bfd = FreqDist()
        tfd = FreqDist()
        for window in ngrams(words, window_size, pad_right=True):
            w1 = window[0]
            if w1 is None:
                continue
            for w2, w3 in _itertools.combinations(window[1:], 2):
                wfd[w1] += 1
                if w2 is None:
                    continue
                bfd[(w1, w2)] += 1
                if w3 is None:
                    continue
                wildfd[(w1, w3)] += 1
                tfd[(w1, w2, w3)] += 1
        return cls(wfd, bfd, wildfd, tfd)

    def bigram_finder(self):
        """Constructs a bigram collocation finder with the bigram and unigram
        data from this finder. Note that this does not include any filtering
        applied to this finder.
        """
        return BigramCollocationFinder(self.word_fd, self.bigram_fd)

    def score_ngram(self, score_fn, w1, w2, w3):
        """Returns the score for a given trigram using the given scoring
        function.
        """
        n_all = self.N
        n_iii = self.ngram_fd[(w1, w2, w3)]
        if not n_iii:
            return
        n_iix = self.bigram_fd[(w1, w2)]
        n_ixi = self.wildcard_fd[(w1, w3)]
        n_xii = self.bigram_fd[(w2, w3)]
        n_ixx = self.word_fd[w1]
        n_xix = self.word_fd[w2]
        n_xxi = self.word_fd[w3]
        return score_fn(n_iii, (n_iix, n_ixi, n_xii), (n_ixx, n_xix, n_xxi), n_all)


class QuadgramCollocationFinder(AbstractCollocationFinder):
    """A tool for the finding and ranking of quadgram collocations or other association measures.
    It is often useful to use from_words() rather than constructing an instance directly.
    """

    default_ws = 4

    def __init__(self, word_fd, quadgram_fd, ii, iii, ixi, ixxi, iixi, ixii):
        """Construct a QuadgramCollocationFinder, given FreqDists for appearances of words,
        contiguous bigrams and trigrams, two words separated by one or by two other words,
        and three words with a single intervening word in either of its two possible positions.
        """
        AbstractCollocationFinder.__init__(self, word_fd, quadgram_fd)
        self.iii = iii
        self.ii = ii
        self.ixi = ixi
        self.ixxi = ixxi
        self.iixi = iixi
        self.ixii = ixii
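
    # With the default window_size of 4, the attribute names read as follows
    # ('i' = a candidate word, 'x' = one intervening word): ii and iii hold
    # contiguous bigram and trigram counts, ixi counts (w1, w3) pairs separated
    # by one word, ixxi counts (w1, w4) pairs separated by two words, and iixi
    # and ixii count word triples whose single gap falls before the last word
    # or after the first word respectively.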

    @classmethod
    def from_words(cls, words, window_size=4):
        if window_size < 4:
            raise ValueError("Specify window_size at least 4")
        ixxx = FreqDist()
        iiii = FreqDist()
        ii = FreqDist()
        iii = FreqDist()
        ixi = FreqDist()
        ixxi = FreqDist()
        iixi = FreqDist()
        ixii = FreqDist()

        for window in ngrams(words, window_size, pad_right=True):
            w1 = window[0]
            if w1 is None:
                continue
            for w2, w3, w4 in _itertools.combinations(window[1:], 3):
                ixxx[w1] += 1
                if w2 is None:
                    continue
                ii[(w1, w2)] += 1
                if w3 is None:
                    continue
                iii[(w1, w2, w3)] += 1
                ixi[(w1, w3)] += 1
                if w4 is None:
                    continue
                iiii[(w1, w2, w3, w4)] += 1
                ixxi[(w1, w4)] += 1
                ixii[(w1, w3, w4)] += 1
                iixi[(w1, w2, w4)] += 1

        return cls(ixxx, iiii, ii, iii, ixi, ixxi, iixi, ixii)

    def score_ngram(self, score_fn, w1, w2, w3, w4):
        n_all = self.N
        n_iiii = self.ngram_fd[(w1, w2, w3, w4)]
        if not n_iiii:
            return
        n_iiix = self.iii[(w1, w2, w3)]
        n_xiii = self.iii[(w2, w3, w4)]
        n_iixi = self.iixi[(w1, w2, w4)]
        n_ixii = self.ixii[(w1, w3, w4)]

        n_iixx = self.ii[(w1, w2)]
        n_xxii = self.ii[(w3, w4)]
        n_xiix = self.ii[(w2, w3)]
        n_ixix = self.ixi[(w1, w3)]
        n_ixxi = self.ixxi[(w1, w4)]
        n_xixi = self.ixi[(w2, w4)]

        n_ixxx = self.word_fd[w1]
        n_xixx = self.word_fd[w2]
        n_xxix = self.word_fd[w3]
        n_xxxi = self.word_fd[w4]
        return score_fn(
            n_iiii,
            (n_iiix, n_iixi, n_ixii, n_xiii),
            (n_iixx, n_ixix, n_ixxi, n_xixi, n_xxii, n_xiix),
            (n_ixxx, n_xixx, n_xxix, n_xxxi),
            n_all,
        )


def demo(scorer=None, compare_scorer=None):
    """Finds bigram collocations in the files of the WebText corpus."""
    from nltk.metrics import (
        BigramAssocMeasures,
        spearman_correlation,
        ranks_from_scores,
    )

    if scorer is None:
        scorer = BigramAssocMeasures.likelihood_ratio
    if compare_scorer is None:
        compare_scorer = BigramAssocMeasures.raw_freq

    from nltk.corpus import stopwords, webtext

    ignored_words = stopwords.words('english')
    word_filter = lambda w: len(w) < 3 or w.lower() in ignored_words

    for file in webtext.fileids():
        words = [word.lower() for word in webtext.words(file)]
        cf = BigramCollocationFinder.from_words(words)
        cf.apply_freq_filter(3)
        cf.apply_word_filter(word_filter)
        corr = spearman_correlation(
            ranks_from_scores(cf.score_ngrams(scorer)),
            ranks_from_scores(cf.score_ngrams(compare_scorer)),
        )
        print(file)
        print('\t', [' '.join(tup) for tup in cf.nbest(scorer, 15)])
        print('\t Correlation to %s: %0.4f' % (compare_scorer.__name__, corr))


# Slows down loading too much
# bigram_measures = BigramAssocMeasures()
# trigram_measures = TrigramAssocMeasures()

if __name__ == '__main__':
    import sys
    from nltk.metrics import BigramAssocMeasures

    try:
        scorer = eval('BigramAssocMeasures.' + sys.argv[1])
    except IndexError:
        scorer = None
    try:
        compare_scorer = eval('BigramAssocMeasures.' + sys.argv[2])
    except IndexError:
        compare_scorer = None

    demo(scorer, compare_scorer)
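
# When run as a script, the optional positional arguments name attributes of
# BigramAssocMeasures (illustrative invocation; requires the NLTK webtext and
# stopwords corpora to be installed):
#     python collocations.py likelihood_ratio raw_freq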

__all__ = [
    'BigramCollocationFinder',
    'TrigramCollocationFinder',
    'QuadgramCollocationFinder',
]