# -*- coding: utf-8 -*-
# Natural Language Toolkit: Tokenizers
#
# Copyright (C) 2001-2018 NLTK Project
# Author: Edward Loper
#         Steven Bird (minor additions)
# Contributors: matthewmc, clouds56
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT

r"""
NLTK Tokenizer Package

Tokenizers divide strings into lists of substrings.  For example,
tokenizers can be used to find the words and punctuation in a string:

    >>> from nltk.tokenize import word_tokenize
    >>> s = '''Good muffins cost $3.88\nin New York.  Please buy me
    ... two of them.\n\nThanks.'''
    >>> word_tokenize(s)
    ['Good', 'muffins', 'cost', '$', '3.88', 'in', 'New', 'York', '.', 'Please', 'buy', 'me', 'two', 'of', 'them', '.', 'Thanks', '.']

This particular tokenizer requires the Punkt sentence tokenization
models to be installed. NLTK also provides a simpler,
regular-expression based tokenizer, which splits text on whitespace
and punctuation:

    >>> from nltk.tokenize import wordpunct_tokenize
    >>> wordpunct_tokenize(s)
    ['Good', 'muffins', 'cost', '$', '3', '.', '88', 'in', 'New', 'York', '.', 'Please', 'buy', 'me', 'two', 'of', 'them', '.', 'Thanks', '.']

We can also operate at the level of sentences, using the sentence
tokenizer directly as follows:

    >>> from nltk.tokenize import sent_tokenize, word_tokenize
    >>> sent_tokenize(s)
    ['Good muffins cost $3.88\nin New York.', 'Please buy me\ntwo of them.', 'Thanks.']
    >>> [word_tokenize(t) for t in sent_tokenize(s)]
    [['Good', 'muffins', 'cost', '$', '3.88', 'in', 'New', 'York', '.'], ['Please', 'buy', 'me', 'two', 'of', 'them', '.'], ['Thanks', '.']]

Caution: when tokenizing a Unicode string, make sure you are not
using an encoded version of the string (it may be necessary to
decode it first, e.g. with ``s.decode("utf8")``).

NLTK tokenizers can produce token-spans, represented as tuples of integers
having the same semantics as string slices, to support efficient comparison
of tokenizers.  (These methods are implemented as generators.)

    >>> from nltk.tokenize import WhitespaceTokenizer
    >>> list(WhitespaceTokenizer().span_tokenize(s))
    [(0, 4), (5, 12), (13, 17), (18, 23), (24, 26), (27, 30), (31, 36), (38, 44), (45, 48), (49, 51), (52, 55), (56, 58), (59, 64), (66, 73)]

There are numerous ways to tokenize text.  If you need more control over
tokenization, see the other methods provided in this package.

For further information, please see Chapter 3 of the NLTK book.
"""

import re

from nltk.data import load
from nltk.tokenize.casual import (TweetTokenizer, casual_tokenize)
from nltk.tokenize.mwe import MWETokenizer
from nltk.tokenize.punkt import PunktSentenceTokenizer
from nltk.tokenize.regexp import (RegexpTokenizer, WhitespaceTokenizer,
                                  BlanklineTokenizer, WordPunctTokenizer,
                                  wordpunct_tokenize, regexp_tokenize,
                                  blankline_tokenize)
from nltk.tokenize.repp import ReppTokenizer
from nltk.tokenize.sexpr import SExprTokenizer, sexpr_tokenize
from nltk.tokenize.simple import (SpaceTokenizer, TabTokenizer, LineTokenizer,
                                  line_tokenize)
from nltk.tokenize.texttiling import TextTilingTokenizer
from nltk.tokenize.toktok import ToktokTokenizer
from nltk.tokenize.treebank import TreebankWordTokenizer
from nltk.tokenize.util import string_span_tokenize, regexp_span_tokenize
from nltk.tokenize.stanford_segmenter import StanfordSegmenter


# Standard sentence tokenizer.
def sent_tokenize(text, language='english'):
    """
    Return a sentence-tokenized copy of *text*,
    using NLTK's recommended sentence tokenizer
    (currently :class:`.PunktSentenceTokenizer`
    for the specified language).
    :param text: text to split into sentences
    :param language: the model name in the Punkt corpus
    """
    tokenizer = load('tokenizers/punkt/{0}.pickle'.format(language))
    return tokenizer.tokenize(text)


# Standard word tokenizer.
_treebank_word_tokenizer = TreebankWordTokenizer()

# See discussion on https://github.com/nltk/nltk/pull/1437
# Adding to TreebankWordTokenizer, the splits on
# - chevron quotes u'\xab' and u'\xbb'
# - unicode quotes u'\u2018', u'\u2019', u'\u201c' and u'\u201d'

improved_open_quote_regex = re.compile(u'([«“‘„]|[`]+)', re.U)
improved_close_quote_regex = re.compile(u'([»”’])', re.U)
improved_punct_regex = re.compile(r'([^\.])(\.)([\]\)}>"\'' u'»”’ ' r']*)\s*$', re.U)
_treebank_word_tokenizer.STARTING_QUOTES.insert(0, (improved_open_quote_regex, r' \1 '))
_treebank_word_tokenizer.ENDING_QUOTES.insert(0, (improved_close_quote_regex, r' \1 '))
_treebank_word_tokenizer.PUNCTUATION.insert(0, (improved_punct_regex, r'\1 \2 \3 '))


def word_tokenize(text, language='english', preserve_line=False):
    """
    Return a tokenized copy of *text*,
    using NLTK's recommended word tokenizer
    (currently an improved :class:`.TreebankWordTokenizer`
    along with :class:`.PunktSentenceTokenizer`
    for the specified language).

    :param text: text to split into words
    :type text: str
    :param language: the model name in the Punkt corpus
    :type language: str
    :param preserve_line: An option to keep the text as a single sentence
        instead of sentence-tokenizing it first.
    :type preserve_line: bool
    """
    sentences = [text] if preserve_line else sent_tokenize(text, language)
    return [token for sent in sentences
            for token in _treebank_word_tokenizer.tokenize(sent)]
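

# A minimal usage sketch (an illustrative addition, not part of NLTK's original
# module): it exercises the public helpers defined above.  Running it assumes
# the Punkt English model has been installed, e.g. via ``nltk.download('punkt')``.
if __name__ == '__main__':
    demo_text = ('Good muffins cost $3.88\nin New York. '
                 'Please buy me two of them.\n\nThanks.')
    # Sentence tokenization with the Punkt model.
    print(sent_tokenize(demo_text))
    # Word tokenization: by default the text is sentence-split with Punkt first,
    # then each sentence is tokenized with the improved Treebank tokenizer ...
    print(word_tokenize(demo_text))
    # ... with preserve_line=True the raw text is tokenized as-is, which keeps
    # non-final periods attached to the preceding word.
    print(word_tokenize(demo_text, preserve_line=True))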