You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

214 lines
3.6 KiB

4 years ago
  1. # Natural Language Toolkit: Some texts for exploration in chapter 1 of the book
  2. #
  3. # Copyright (C) 2001-2019 NLTK Project
  4. # Author: Steven Bird <stevenbird1@gmail.com>
  5. #
  6. # URL: <http://nltk.org/>
  7. # For license information, see LICENSE.TXT
  8. from __future__ import print_function
  9. from nltk.corpus import (
  10. gutenberg,
  11. genesis,
  12. inaugural,
  13. nps_chat,
  14. webtext,
  15. treebank,
  16. wordnet,
  17. )
  18. from nltk.text import Text
  19. from nltk.probability import FreqDist
  20. from nltk.util import bigrams
  21. print("*** Introductory Examples for the NLTK Book ***")
  22. print("Loading text1, ..., text9 and sent1, ..., sent9")
  23. print("Type the name of the text or sentence to view it.")
  24. print("Type: 'texts()' or 'sents()' to list the materials.")
  25. text1 = Text(gutenberg.words('melville-moby_dick.txt'))
  26. print("text1:", text1.name)
  27. text2 = Text(gutenberg.words('austen-sense.txt'))
  28. print("text2:", text2.name)
  29. text3 = Text(genesis.words('english-kjv.txt'), name="The Book of Genesis")
  30. print("text3:", text3.name)
  31. text4 = Text(inaugural.words(), name="Inaugural Address Corpus")
  32. print("text4:", text4.name)
  33. text5 = Text(nps_chat.words(), name="Chat Corpus")
  34. print("text5:", text5.name)
  35. text6 = Text(webtext.words('grail.txt'), name="Monty Python and the Holy Grail")
  36. print("text6:", text6.name)
  37. text7 = Text(treebank.words(), name="Wall Street Journal")
  38. print("text7:", text7.name)
  39. text8 = Text(webtext.words('singles.txt'), name="Personals Corpus")
  40. print("text8:", text8.name)
  41. text9 = Text(gutenberg.words('chesterton-thursday.txt'))
  42. print("text9:", text9.name)
  43. def texts():
  44. print("text1:", text1.name)
  45. print("text2:", text2.name)
  46. print("text3:", text3.name)
  47. print("text4:", text4.name)
  48. print("text5:", text5.name)
  49. print("text6:", text6.name)
  50. print("text7:", text7.name)
  51. print("text8:", text8.name)
  52. print("text9:", text9.name)
  53. sent1 = ["Call", "me", "Ishmael", "."]
  54. sent2 = [
  55. "The",
  56. "family",
  57. "of",
  58. "Dashwood",
  59. "had",
  60. "long",
  61. "been",
  62. "settled",
  63. "in",
  64. "Sussex",
  65. ".",
  66. ]
  67. sent3 = [
  68. "In",
  69. "the",
  70. "beginning",
  71. "God",
  72. "created",
  73. "the",
  74. "heaven",
  75. "and",
  76. "the",
  77. "earth",
  78. ".",
  79. ]
  80. sent4 = [
  81. "Fellow",
  82. "-",
  83. "Citizens",
  84. "of",
  85. "the",
  86. "Senate",
  87. "and",
  88. "of",
  89. "the",
  90. "House",
  91. "of",
  92. "Representatives",
  93. ":",
  94. ]
  95. sent5 = [
  96. "I",
  97. "have",
  98. "a",
  99. "problem",
  100. "with",
  101. "people",
  102. "PMing",
  103. "me",
  104. "to",
  105. "lol",
  106. "JOIN",
  107. ]
  108. sent6 = [
  109. 'SCENE',
  110. '1',
  111. ':',
  112. '[',
  113. 'wind',
  114. ']',
  115. '[',
  116. 'clop',
  117. 'clop',
  118. 'clop',
  119. ']',
  120. 'KING',
  121. 'ARTHUR',
  122. ':',
  123. 'Whoa',
  124. 'there',
  125. '!',
  126. ]
  127. sent7 = [
  128. "Pierre",
  129. "Vinken",
  130. ",",
  131. "61",
  132. "years",
  133. "old",
  134. ",",
  135. "will",
  136. "join",
  137. "the",
  138. "board",
  139. "as",
  140. "a",
  141. "nonexecutive",
  142. "director",
  143. "Nov.",
  144. "29",
  145. ".",
  146. ]
  147. sent8 = [
  148. '25',
  149. 'SEXY',
  150. 'MALE',
  151. ',',
  152. 'seeks',
  153. 'attrac',
  154. 'older',
  155. 'single',
  156. 'lady',
  157. ',',
  158. 'for',
  159. 'discreet',
  160. 'encounters',
  161. '.',
  162. ]
  163. sent9 = [
  164. "THE",
  165. "suburb",
  166. "of",
  167. "Saffron",
  168. "Park",
  169. "lay",
  170. "on",
  171. "the",
  172. "sunset",
  173. "side",
  174. "of",
  175. "London",
  176. ",",
  177. "as",
  178. "red",
  179. "and",
  180. "ragged",
  181. "as",
  182. "a",
  183. "cloud",
  184. "of",
  185. "sunset",
  186. ".",
  187. ]
  188. def sents():
  189. print("sent1:", " ".join(sent1))
  190. print("sent2:", " ".join(sent2))
  191. print("sent3:", " ".join(sent3))
  192. print("sent4:", " ".join(sent4))
  193. print("sent5:", " ".join(sent5))
  194. print("sent6:", " ".join(sent6))
  195. print("sent7:", " ".join(sent7))
  196. print("sent8:", " ".join(sent8))
  197. print("sent9:", " ".join(sent9))