# Natural Language Toolkit: Texts
#
# Copyright (C) 2001-2019 NLTK Project
# Author: Steven Bird <stevenbird1@gmail.com>
#         Edward Loper <edloper@gmail.com>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
"""
This module brings together a variety of NLTK functionality for
text analysis, and provides simple, interactive interfaces.
Functionality includes: concordancing, collocation discovery,
regular expression search over tokenized strings, and
distributional similarity.
"""
from __future__ import print_function, division, unicode_literals, absolute_import

from math import log
from collections import defaultdict, Counter, namedtuple
from functools import reduce
import re

from six import text_type

from nltk.probability import FreqDist
from nltk.probability import ConditionalFreqDist as CFD
from nltk.util import tokenwrap, LazyConcatenation
from nltk.metrics import f_measure, BigramAssocMeasures
from nltk.collocations import BigramCollocationFinder
from nltk.compat import python_2_unicode_compatible

ConcordanceLine = namedtuple(
    'ConcordanceLine',
    ['left', 'query', 'right', 'offset', 'left_print', 'right_print', 'line'],
)
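
# Each hit returned by ConcordanceIndex.find_concordance() is packed into one
# ConcordanceLine; illustrative values for a hypothetical token list:
#   ConcordanceLine(left=['Call', 'me'], query='Ishmael', right=['.', 'Some'],
#                   offset=2, left_print='Call me', right_print='. Some',
#                   line='Call me Ishmael . Some')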


class ContextIndex(object):
    """
    A bidirectional index between words and their 'contexts' in a text.
    The context of a word is usually defined to be the words that occur
    in a fixed window around the word; but other definitions may also
    be used by providing a custom context function.
    """

    @staticmethod
    def _default_context(tokens, i):
        """One left token and one right token, normalized to lowercase"""
        left = tokens[i - 1].lower() if i != 0 else '*START*'
        right = tokens[i + 1].lower() if i != len(tokens) - 1 else '*END*'
        return (left, right)

    def __init__(self, tokens, context_func=None, filter=None, key=lambda x: x):
        self._key = key
        self._tokens = tokens
        if context_func:
            self._context_func = context_func
        else:
            self._context_func = self._default_context
        if filter:
            tokens = [t for t in tokens if filter(t)]
        self._word_to_contexts = CFD(
            (self._key(w), self._context_func(tokens, i)) for i, w in enumerate(tokens)
        )
        self._context_to_words = CFD(
            (self._context_func(tokens, i), self._key(w)) for i, w in enumerate(tokens)
        )

    def tokens(self):
        """
        :rtype: list(str)
        :return: The document that this context index was
            created from.
        """
        return self._tokens

    def word_similarity_dict(self, word):
        """
        Return a dictionary mapping from words to 'similarity scores,'
        indicating how often these two words occur in the same
        context.
        """
        word = self._key(word)
        word_contexts = set(self._word_to_contexts[word])

        scores = {}
        for w, w_contexts in self._word_to_contexts.items():
            scores[w] = f_measure(word_contexts, set(w_contexts))

        return scores

    def similar_words(self, word, n=20):
        scores = defaultdict(int)
        for c in self._word_to_contexts[self._key(word)]:
            for w in self._context_to_words[c]:
                if w != word:
                    scores[w] += (
                        self._context_to_words[c][word] * self._context_to_words[c][w]
                    )
        return sorted(scores, key=scores.get, reverse=True)[:n]

    def common_contexts(self, words, fail_on_unknown=False):
        """
        Find contexts where the specified words can all appear; and
        return a frequency distribution mapping each context to the
        number of times that context was used.

        :param words: The words used to seed the similarity search
        :type words: str
        :param fail_on_unknown: If true, then raise a value error if
            any of the given words do not occur at all in the index.
        """
        words = [self._key(w) for w in words]
        contexts = [set(self._word_to_contexts[w]) for w in words]

        empty = [words[i] for i in range(len(words)) if not contexts[i]]
        common = reduce(set.intersection, contexts)

        if empty and fail_on_unknown:
            raise ValueError("The following word(s) were not found:", " ".join(words))
        elif not common:
            # nothing in common -- just return an empty freqdist.
            return FreqDist()
        else:
            fd = FreqDist(
                c for w in words for c in self._word_to_contexts[w] if c in common
            )
            return fd
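

# Illustrative sketch: typical ContextIndex usage on a hypothetical token list.
# The helper below is only a usage example and is never called on import.
def _context_index_example():
    tokens = ['The', 'cat', 'sat', 'on', 'the', 'mat', '.',
              'The', 'dog', 'sat', 'on', 'the', 'rug', '.']
    idx = ContextIndex(tokens, key=lambda s: s.lower())
    # 'cat' and 'dog' share the ('the', 'sat') context, so each is listed as
    # distributionally similar to the other.
    print(idx.similar_words('cat'))
    # FreqDist mapping each shared (left, right) context to its frequency.
    print(idx.common_contexts(['cat', 'dog']))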


@python_2_unicode_compatible
class ConcordanceIndex(object):
    """
    An index that can be used to look up the offset locations at which
    a given word occurs in a document.
    """

    def __init__(self, tokens, key=lambda x: x):
        """
        Construct a new concordance index.

        :param tokens: The document (list of tokens) that this
            concordance index was created from.  This list can be used
            to access the context of a given word occurrence.
        :param key: A function that maps each token to a normalized
            version that will be used as a key in the index.  E.g., if
            you use ``key=lambda s:s.lower()``, then the index will be
            case-insensitive.
        """
        self._tokens = tokens
        """The document (list of tokens) that this concordance index
           was created from."""

        self._key = key
        """Function mapping each token to an index key (or None)."""

        self._offsets = defaultdict(list)
        """Dictionary mapping words (or keys) to lists of offset indices."""

        # Initialize the index (self._offsets)
        for index, word in enumerate(tokens):
            word = self._key(word)
            self._offsets[word].append(index)

    def tokens(self):
        """
        :rtype: list(str)
        :return: The document that this concordance index was
            created from.
        """
        return self._tokens

    def offsets(self, word):
        """
        :rtype: list(int)
        :return: A list of the offset positions at which the given
            word occurs.  If a key function was specified for the
            index, then given word's key will be looked up.
        """
        word = self._key(word)
        return self._offsets[word]

    def __repr__(self):
        return '<ConcordanceIndex for %d tokens (%d types)>' % (
            len(self._tokens),
            len(self._offsets),
        )

    def find_concordance(self, word, width=80):
        """
        Find all concordance lines given the query word.
        """
        half_width = (width - len(word) - 2) // 2
        context = width // 4  # approx number of words of context

        # Find the instances of the word to create the ConcordanceLine
        concordance_list = []
        offsets = self.offsets(word)
        if offsets:
            for i in offsets:
                query_word = self._tokens[i]
                # Find the context of query word.
                left_context = self._tokens[max(0, i - context) : i]
                right_context = self._tokens[i + 1 : i + context]
                # Create the pretty lines with the query_word in the middle.
                left_print = ' '.join(left_context)[-half_width:]
                right_print = ' '.join(right_context)[:half_width]
                # The WYSIWYG line of the concordance.
                line_print = ' '.join([left_print, query_word, right_print])
                # Create the ConcordanceLine
                concordance_line = ConcordanceLine(
                    left_context,
                    query_word,
                    right_context,
                    i,
                    left_print,
                    right_print,
                    line_print,
                )
                concordance_list.append(concordance_line)
        return concordance_list
    def print_concordance(self, word, width=80, lines=25):
        """
        Print concordance lines given the query word.

        :param word: The target word
        :type word: str
        :param lines: The number of lines to display (default=25)
        :type lines: int
        :param width: The width of each line, in characters (default=80)
        :type width: int
        """
        concordance_list = self.find_concordance(word, width=width)

        if not concordance_list:
            print("no matches")
        else:
            lines = min(lines, len(concordance_list))
            print("Displaying {} of {} matches:".format(lines, len(concordance_list)))
            for i, concordance_line in enumerate(concordance_list[:lines]):
                print(concordance_line.line)
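

# Illustrative sketch: querying a ConcordanceIndex over a corpus.  Assumes the
# Gutenberg corpus has been fetched with nltk.download('gutenberg'); never called.
def _concordance_index_example():
    from nltk.corpus import gutenberg

    idx = ConcordanceIndex(
        gutenberg.words('melville-moby_dick.txt'), key=lambda s: s.lower()
    )
    print(len(idx.offsets('whale')))  # number of case-insensitive occurrences
    idx.print_concordance('whale', width=60, lines=5)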


class TokenSearcher(object):
    """
    A class that makes it easier to use regular expressions to search
    over tokenized strings.  The tokenized string is converted to a
    string where tokens are marked with angle brackets -- e.g.,
    ``'<the><window><is><still><open>'``.  The regular expression
    passed to the ``findall()`` method is modified to treat angle
    brackets as non-capturing parentheses, in addition to matching the
    token boundaries; and to have ``'.'`` not match the angle brackets.
    """

    def __init__(self, tokens):
        self._raw = ''.join('<' + w + '>' for w in tokens)

    def findall(self, regexp):
        """
        Find instances of the regular expression in the text.
        The text is a list of tokens, and a regexp pattern to match
        a single token must be surrounded by angle brackets.  E.g.

        >>> from nltk.text import TokenSearcher
        >>> print('hack'); from nltk.book import text1, text5, text9
        hack...
        >>> text5.findall("<.*><.*><bro>")
        you rule bro; telling you bro; u twizted bro
        >>> text1.findall("<a>(<.*>)<man>")
        monied; nervous; dangerous; white; white; white; pious; queer; good;
        mature; white; Cape; great; wise; wise; butterless; white; fiendish;
        pale; furious; better; certain; complete; dismasted; younger; brave;
        brave; brave; brave
        >>> text9.findall("<th.*>{3,}")
        thread through those; the thought that; that the thing; the thing
        that; that that thing; through these than through; them that the;
        through the thick; them that they; thought that the

        :param regexp: A regular expression
        :type regexp: str
        """
        # preprocess the regular expression
        regexp = re.sub(r'\s', '', regexp)
        regexp = re.sub(r'<', '(?:<(?:', regexp)
        regexp = re.sub(r'>', ')>)', regexp)
        regexp = re.sub(r'(?<!\\)\.', '[^>]', regexp)

        # perform the search
        hits = re.findall(regexp, self._raw)

        # Sanity check
        for h in hits:
            if not h.startswith('<') and h.endswith('>'):
                raise ValueError('Bad regexp for TokenSearcher.findall')

        # postprocess the output
        hits = [h[1:-1].split('><') for h in hits]
        return hits
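

# Illustrative sketch of the rewriting findall() performs: the pattern "<the><.*>"
# becomes '(?:<(?:the)>)(?:<(?:[^>]*)>)' and is matched against the internal
# string '<the><window><is><still><open>'.  Never called on import.
def _token_searcher_example():
    searcher = TokenSearcher(['the', 'window', 'is', 'still', 'open'])
    return searcher.findall("<the><.*>")  # -> [['the', 'window']]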


@python_2_unicode_compatible
class Text(object):
    """
    A wrapper around a sequence of simple (string) tokens, which is
    intended to support initial exploration of texts (via the
    interactive console).  Its methods perform a variety of analyses
    on the text's contexts (e.g., counting, concordancing, collocation
    discovery), and display the results.  If you wish to write a
    program which makes use of these analyses, then you should bypass
    the ``Text`` class, and use the appropriate analysis function or
    class directly instead.

    A ``Text`` is typically initialized from a given document or
    corpus.  E.g.:

    >>> import nltk.corpus
    >>> from nltk.text import Text
    >>> moby = Text(nltk.corpus.gutenberg.words('melville-moby_dick.txt'))

    """

    # This defeats lazy loading, but makes things faster.  This
    # *shouldn't* be necessary because the corpus view *should* be
    # doing intelligent caching, but without this it's running slow.
    # Look into whether the caching is working correctly.
    _COPY_TOKENS = True

    def __init__(self, tokens, name=None):
        """
        Create a Text object.

        :param tokens: The source text.
        :type tokens: sequence of str
        """
        if self._COPY_TOKENS:
            tokens = list(tokens)
        self.tokens = tokens

        if name:
            self.name = name
        elif ']' in tokens[:20]:
            end = tokens[:20].index(']')
            self.name = " ".join(text_type(tok) for tok in tokens[1:end])
        else:
            self.name = " ".join(text_type(tok) for tok in tokens[:8]) + "..."

    # ////////////////////////////////////////////////////////////
    # Support item & slice access
    # ////////////////////////////////////////////////////////////

    def __getitem__(self, i):
        return self.tokens[i]

    def __len__(self):
        return len(self.tokens)

    # ////////////////////////////////////////////////////////////
    # Interactive console methods
    # ////////////////////////////////////////////////////////////
    def concordance(self, word, width=79, lines=25):
        """
        Prints a concordance for ``word`` with the specified context window.
        Word matching is not case-sensitive.

        :param word: The target word
        :type word: str
        :param width: The width of each line, in characters (default=79)
        :type width: int
        :param lines: The number of lines to display (default=25)
        :type lines: int

        :seealso: ``ConcordanceIndex``
        """
        if '_concordance_index' not in self.__dict__:
            self._concordance_index = ConcordanceIndex(
                self.tokens, key=lambda s: s.lower()
            )

        return self._concordance_index.print_concordance(word, width, lines)

    def concordance_list(self, word, width=79, lines=25):
        """
        Generate a concordance for ``word`` with the specified context window.
        Word matching is not case-sensitive.

        :param word: The target word
        :type word: str
        :param width: The width of each line, in characters (default=79)
        :type width: int
        :param lines: The number of lines to display (default=25)
        :type lines: int

        :seealso: ``ConcordanceIndex``
        """
        if '_concordance_index' not in self.__dict__:
            self._concordance_index = ConcordanceIndex(
                self.tokens, key=lambda s: s.lower()
            )
        return self._concordance_index.find_concordance(word, width)[:lines]
    def collocation_list(self, num=20, window_size=2):
        """
        Return collocations derived from the text, ignoring stopwords.

        :param num: The maximum number of collocations to return.
        :type num: int
        :param window_size: The number of tokens spanned by a collocation (default=2)
        :type window_size: int
        """
        if not (
            '_collocations' in self.__dict__
            and self._num == num
            and self._window_size == window_size
        ):
            self._num = num
            self._window_size = window_size

            # print("Building collocations list")
            from nltk.corpus import stopwords

            ignored_words = stopwords.words('english')
            finder = BigramCollocationFinder.from_words(self.tokens, window_size)
            finder.apply_freq_filter(2)
            finder.apply_word_filter(lambda w: len(w) < 3 or w.lower() in ignored_words)
            bigram_measures = BigramAssocMeasures()
            self._collocations = finder.nbest(bigram_measures.likelihood_ratio, num)
        return [w1 + ' ' + w2 for w1, w2 in self._collocations]
    def collocations(self, num=20, window_size=2):
        """
        Print collocations derived from the text, ignoring stopwords.

        :param num: The maximum number of collocations to print.
        :type num: int
        :param window_size: The number of tokens spanned by a collocation (default=2)
        :type window_size: int
        """
        # collocation_list() already returns joined "word1 word2" strings, so
        # print them directly rather than trying to unpack each string.
        collocation_strings = self.collocation_list(num, window_size)
        print(tokenwrap(collocation_strings, separator="; "))
    def count(self, word):
        """
        Count the number of times this word appears in the text.
        """
        return self.tokens.count(word)

    def index(self, word):
        """
        Find the index of the first occurrence of the word in the text.
        """
        return self.tokens.index(word)

    def readability(self, method):
        # code from nltk_contrib.readability
        raise NotImplementedError

    def similar(self, word, num=20):
        """
        Distributional similarity: find other words which appear in the
        same contexts as the specified word; list most similar words first.

        :param word: The word used to seed the similarity search
        :type word: str
        :param num: The number of words to generate (default=20)
        :type num: int
        :seealso: ContextIndex.similar_words()
        """
        if '_word_context_index' not in self.__dict__:
            # print('Building word-context index...')
            self._word_context_index = ContextIndex(
                self.tokens, filter=lambda x: x.isalpha(), key=lambda s: s.lower()
            )

        # words = self._word_context_index.similar_words(word, num)

        word = word.lower()
        wci = self._word_context_index._word_to_contexts
        if word in wci.conditions():
            contexts = set(wci[word])
            fd = Counter(
                w
                for w in wci.conditions()
                for c in wci[w]
                if c in contexts and not w == word
            )
            words = [w for w, _ in fd.most_common(num)]
            print(tokenwrap(words))
        else:
            print("No matches")
    def common_contexts(self, words, num=20):
        """
        Find contexts where the specified words appear; list
        most frequent common contexts first.

        :param words: The words used to seed the similarity search
        :type words: str
        :param num: The number of words to generate (default=20)
        :type num: int
        :seealso: ContextIndex.common_contexts()
        """
        if '_word_context_index' not in self.__dict__:
            # print('Building word-context index...')
            self._word_context_index = ContextIndex(
                self.tokens, key=lambda s: s.lower()
            )

        try:
            fd = self._word_context_index.common_contexts(words, True)
            if not fd:
                print("No common contexts were found")
            else:
                ranked_contexts = [w for w, _ in fd.most_common(num)]
                print(tokenwrap(w1 + "_" + w2 for w1, w2 in ranked_contexts))

        except ValueError as e:
            print(e)
    def dispersion_plot(self, words):
        """
        Produce a plot showing the distribution of the words through the text.
        Requires pylab to be installed.

        :param words: The words to be plotted
        :type words: list(str)
        :seealso: nltk.draw.dispersion_plot()
        """
        from nltk.draw import dispersion_plot

        dispersion_plot(self, words)

    def generate(self, words):
        """
        Issues a reminder to users following the book online
        """
        import warnings

        warnings.warn(
            'The generate() method is no longer available.', DeprecationWarning
        )

    def plot(self, *args):
        """
        See documentation for FreqDist.plot()
        :seealso: nltk.prob.FreqDist.plot()
        """
        self.vocab().plot(*args)

    def vocab(self):
        """
        :seealso: nltk.prob.FreqDist
        """
        if "_vocab" not in self.__dict__:
            # print("Building vocabulary index...")
            self._vocab = FreqDist(self)
        return self._vocab
    def findall(self, regexp):
        """
        Find instances of the regular expression in the text.
        The text is a list of tokens, and a regexp pattern to match
        a single token must be surrounded by angle brackets.  E.g.

        >>> print('hack'); from nltk.book import text1, text5, text9
        hack...
        >>> text5.findall("<.*><.*><bro>")
        you rule bro; telling you bro; u twizted bro
        >>> text1.findall("<a>(<.*>)<man>")
        monied; nervous; dangerous; white; white; white; pious; queer; good;
        mature; white; Cape; great; wise; wise; butterless; white; fiendish;
        pale; furious; better; certain; complete; dismasted; younger; brave;
        brave; brave; brave
        >>> text9.findall("<th.*>{3,}")
        thread through those; the thought that; that the thing; the thing
        that; that that thing; through these than through; them that the;
        through the thick; them that they; thought that the

        :param regexp: A regular expression
        :type regexp: str
        """

        if "_token_searcher" not in self.__dict__:
            self._token_searcher = TokenSearcher(self)

        hits = self._token_searcher.findall(regexp)
        hits = [' '.join(h) for h in hits]
        print(tokenwrap(hits, "; "))

    # ////////////////////////////////////////////////////////////
    # Helper Methods
    # ////////////////////////////////////////////////////////////
    _CONTEXT_RE = re.compile(r'\w+|[\.\!\?]')

    def _context(self, tokens, i):
        """
        One left & one right token, both case-normalized.  Skip over
        non-sentence-final punctuation.  Used by the ``ContextIndex``
        that is created for ``similar()`` and ``common_contexts()``.
        """
        # Left context
        j = i - 1
        while j >= 0 and not self._CONTEXT_RE.match(tokens[j]):
            j -= 1
        left = tokens[j] if j != 0 else '*START*'

        # Right context
        j = i + 1
        while j < len(tokens) and not self._CONTEXT_RE.match(tokens[j]):
            j += 1
        right = tokens[j] if j != len(tokens) else '*END*'

        return (left, right)

    # ////////////////////////////////////////////////////////////
    # String Display
    # ////////////////////////////////////////////////////////////

    def __str__(self):
        return '<Text: %s>' % self.name

    def __repr__(self):
        return '<Text: %s>' % self.name
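

# Illustrative sketch: Text's interactive methods in a typical console session.
# Assumes the 'gutenberg' and 'stopwords' corpora are available; never called on import.
def _text_example():
    from nltk.corpus import gutenberg

    moby = Text(gutenberg.words('melville-moby_dick.txt'))
    moby.concordance('monstrous', width=60, lines=5)  # print matches in context
    moby.similar('monstrous')  # distributionally similar words
    print(moby.collocation_list(10))  # top bigram collocations
    print(moby.count('whale'))  # raw frequency of a token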


# Prototype only; this approach will be slow to load
class TextCollection(Text):
    """A collection of texts, which can be loaded with list of texts, or
    with a corpus consisting of one or more texts, and which supports
    counting, concordancing, collocation discovery, etc.  Initialize a
    TextCollection as follows:

    >>> import nltk.corpus
    >>> from nltk.text import TextCollection
    >>> print('hack'); from nltk.book import text1, text2, text3
    hack...
    >>> gutenberg = TextCollection(nltk.corpus.gutenberg)
    >>> mytexts = TextCollection([text1, text2, text3])

    Iterating over a TextCollection produces all the tokens of all the
    texts in order.
    """

    def __init__(self, source):
        if hasattr(source, 'words'):  # bridge to the text corpus reader
            source = [source.words(f) for f in source.fileids()]

        self._texts = source
        Text.__init__(self, LazyConcatenation(source))
        self._idf_cache = {}

    def tf(self, term, text):
        """ The frequency of the term in text. """
        return text.count(term) / len(text)

    def idf(self, term):
        """ The number of texts in the corpus divided by the
        number of texts that the term appears in.
        If a term does not appear in the corpus, 0.0 is returned. """
        # idf values are cached for performance.
        idf = self._idf_cache.get(term)
        if idf is None:
            matches = len([True for text in self._texts if term in text])
            if len(self._texts) == 0:
                raise ValueError('IDF undefined for empty document collection')
            idf = log(len(self._texts) / matches) if matches else 0.0
            self._idf_cache[term] = idf
        return idf

    def tf_idf(self, term, text):
        return self.tf(term, text) * self.idf(term)
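

# Illustrative sketch: tf-idf on a toy collection, where tf = count / len(text),
# idf = log(N / number of texts containing the term), and 0.0 for unseen terms.
# The toy documents are hypothetical; the function is never called on import.
def _text_collection_example():
    docs = [['a', 'b', 'a'], ['a', 'c']]
    tc = TextCollection(docs)
    print(tc.tf('a', docs[0]))  # 2/3
    print(tc.idf('b'))  # log(2/1)
    print(tc.tf_idf('b', docs[0]))  # (1/3) * log(2)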


def demo():
    from nltk.corpus import brown

    text = Text(brown.words(categories='news'))
    print(text)
    print()
    print("Concordance:")
    text.concordance('news')
    print()
    print("Distributionally similar words:")
    text.similar('news')
    print()
    print("Collocations:")
    text.collocations()
    print()
    # print("Automatically generated text:")
    # text.generate()
    # print()
    print("Dispersion plot:")
    text.dispersion_plot(['news', 'report', 'said', 'announced'])
    print()
    print("Vocabulary plot:")
    text.plot(50)
    print()
    print("Indexing:")
    print("text[3]:", text[3])
    print("text[3:5]:", text[3:5])
    print("text.vocab()['news']:", text.vocab()['news'])


if __name__ == '__main__':
    demo()

__all__ = [
    "ContextIndex",
    "ConcordanceIndex",
    "TokenSearcher",
    "Text",
    "TextCollection",
]