textblob.tokenizers — TextBlob 0.19.0 documentation
Source code for textblob.tokenizers
"""Various tokenizer implementations.
.. versionadded:: 0.4.0 """
from itertools import chain
import nltk
from textblob.base import BaseTokenizer
from textblob.decorators import requires_nltk_corpus
from textblob.utils import strip_punc
class WordTokenizer(BaseTokenizer):
    """NLTK's recommended word tokenizer (currently the TreebankWordTokenizer).
    Uses regular expressions to tokenize text. Assumes text has already been
    segmented into sentences.

    Performs the following steps:

    * split standard contractions, e.g. don't -> do n't
    * split commas and single quotes
    * separate periods that appear at the end of line
    """
    def tokenize(self, text, include_punc=True):
        """Return a list of word tokens.

        :param text: string of text.
        :param include_punc: (optional) whether to include punctuation
            as separate tokens. Defaults to True.
        """
        tokens = nltk.tokenize.word_tokenize(text)
        if include_punc:
            return tokens
        else:
            # Return each word token, stripping punctuation unless the
            # token comes from a contraction
            # e.g. "Let's" => ["Let", "'s"]
            # e.g. "Can't" => ["Ca", "n't"]
            # e.g. "home." => ["home"]
            return [
                word if word.startswith("'") else strip_punc(word, all=False)
                for word in tokens
                if strip_punc(word, all=False)
            ]
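
# Usage sketch (not part of the original module; assumes the NLTK "punkt"
# tokenizer data has been downloaded, e.g. via nltk.download("punkt")):
#
#   >>> WordTokenizer().tokenize("Can't stop here.")
#   ['Ca', "n't", 'stop', 'here', '.']
#   >>> WordTokenizer().tokenize("Can't stop here.", include_punc=False)
#   ['Ca', "n't", 'stop', 'here']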
class SentenceTokenizer(BaseTokenizer):
    """NLTK's sentence tokenizer (currently PunktSentenceTokenizer).
    Uses an unsupervised algorithm to build a model for abbreviation words,
    collocations, and words that start sentences, then uses that to find
    sentence boundaries.
    """
    @requires_nltk_corpus
    def tokenize(self, text):
        """Return a list of sentences."""
        return nltk.tokenize.sent_tokenize(text)
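
    # Example (a sketch, not part of the original source; assumes the NLTK
    # "punkt" sentence model is installed):
    #
    #   >>> SentenceTokenizer().tokenize("Hello world. How are you?")
    #   ['Hello world.', 'How are you?']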
#: Convenience function for tokenizing sentences
sent_tokenize = SentenceTokenizer().itokenize
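
# Note: itokenize (defined on BaseTokenizer) yields tokens lazily, so
# sent_tokenize returns a generator rather than a list. A sketch:
#
#   >>> gen = sent_tokenize("It rained. We stayed in.")
#   >>> next(gen)
#   'It rained.'
#   >>> list(gen)
#   ['We stayed in.']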
_word_tokenizer = WordTokenizer() # Singleton word tokenizer
def word_tokenize(text, include_punc=True, *args, **kwargs):
    """Convenience function for tokenizing text into words.

    NOTE: NLTK's word tokenizer expects sentences as input, so the text will be
    tokenized to sentences before being tokenized to words.
    """
    words = chain.from_iterable(
        _word_tokenizer.itokenize(sentence, include_punc, *args, **kwargs)
        for sentence in sent_tokenize(text)
    )
    return words
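
# word_tokenize returns a lazy itertools.chain object; wrap it in list() to
# materialize the tokens. A sketch (assuming the "punkt" data is available):
#
#   >>> list(word_tokenize("Don't stop. Keep going."))
#   ['Do', "n't", 'stop', '.', 'Keep', 'going', '.']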