# textblob.classifiers — TextBlob 0.19.0 documentation (original) (raw)

"""Various classifier implementations. Also includes basic feature extractor methods.

Example Usage: ::

    >>> from textblob import TextBlob
    >>> from textblob.classifiers import NaiveBayesClassifier
    >>> train = [
    ...     ('I love this sandwich.', 'pos'),
    ...     ('This is an amazing place!', 'pos'),
    ...     ('I feel very good about these beers.', 'pos'),
    ...     ('I do not like this restaurant', 'neg'),
    ...     ('I am tired of this stuff.', 'neg'),
    ...     ("I can't deal with this", 'neg'),
    ...     ("My boss is horrible.", "neg")
    ... ]
    >>> cl = NaiveBayesClassifier(train)
    >>> cl.classify("I feel amazing!")
    'pos'
    >>> blob = TextBlob("The beer is good. But the hangover is horrible.", classifier=cl)
    >>> for s in blob.sentences:
    ...     print(s)
    ...     print(s.classify())
    ...
    The beer is good.
    pos
    But the hangover is horrible.
    neg

.. versionadded:: 0.6.0
"""  # noqa: E501

from itertools import chain

import nltk

import textblob.formats as formats from textblob.decorators import cached_property from textblob.exceptions import FormatError from textblob.tokenizers import word_tokenize from textblob.utils import is_filelike, strip_punc

# Types treated as raw, untokenized text by the ``isinstance`` checks in the
# tokenizing helpers of this module; any other input is iterated as tokens.
basestring = (str, bytes)

# Basic feature extractors

def _get_words_from_dataset(dataset):
    """Collect every distinct word that occurs in a dataset.

    :param dataset: An iterable of ``(words, label)`` pairs in which
        ``words`` is either a string or a list of tokens.
    :rtype: set
    """

    def _as_tokens(entry):
        # Entries that are not raw text are taken to be token sequences and
        # pass through untouched; raw text is tokenized with
        # ``include_punc=False``.
        if not isinstance(entry, basestring):
            return entry
        return word_tokenize(entry, include_punc=False)

    vocabulary = set()
    for words, _label in dataset:
        vocabulary.update(_as_tokens(words))
    return vocabulary

def _get_document_tokens(document):
    """Return the set of tokens found in ``document``.

    :param document: Either a string, which is tokenized with
        ``word_tokenize(..., include_punc=False)``, or an iterable of tokens.
    :rtype: set

    Every token is passed through ``strip_punc(token, all=False)`` before
    being added to the result.
    """
    if isinstance(document, basestring):
        raw_tokens = word_tokenize(document, include_punc=False)
    else:
        raw_tokens = document
    return {strip_punc(token, all=False) for token in raw_tokens}

# CLASSIFIERS

class BaseClassifier:
    """Abstract base class from which all classifiers inherit.

    At a minimum, descendant classes must implement a ``classify`` method and
    have a ``classifier`` property.

    :param train_set: The training set, either a list of tuples of the form
        ``(text, classification)`` or a file-like object. ``text`` may be either
        a string or an iterable.
    :param callable feature_extractor: A feature extractor function that takes one or
        two arguments: ``document`` and ``train_set``.
    :param str format: If ``train_set`` is a file-like object, the file format,
        e.g. ``"csv"`` or ``"json"``. If ``None``, will attempt to detect the
        file format.
    :param kwargs: Additional keyword arguments are passed to the constructor
        of the :class:`Format <textblob.formats.BaseFormat>` class used to
        read the data. Only applies when a file-like object is passed as
        ``train_set``.

    .. versionadded:: 0.6.0
    """

    def __init__(
        self, train_set, feature_extractor=basic_extractor, format=None, **kwargs
    ):
        # Stored before reading any data: ``_read_data`` forwards these
        # keyword arguments to the format class.
        self.format_kwargs = kwargs
        self.feature_extractor = feature_extractor
        # File-like sources are parsed; anything else is used as given
        # (expected to be a list of tuples).
        self.train_set = (
            self._read_data(train_set, format) if is_filelike(train_set) else train_set
        )
        # Keep a hidden set of unique words.
        self._word_set = _get_words_from_dataset(self.train_set)
        self.train_features = None

    def _read_data(self, dataset, format=None):
        """Read a data source and return an iterable usable as testing or
        training data.

        :param dataset: The data source handed to the format class.
        :param format: Name of a registered format. When falsy, the format is
            detected from ``dataset``.
        :raises ValueError: If ``format`` is given but not in the registry.
        :raises FormatError: If no format is given and detection fails.
        """
        if format:
            available = formats.get_registry()
            if format not in available:
                raise ValueError(f"'{format}' format not supported.")
            reader_class = available[format]
        else:
            # No explicit format: attempt to detect it from the data itself.
            reader_class = formats.detect(dataset)
            if not reader_class:
                raise FormatError(
                    "Could not automatically detect format for the given data source."
                )
        reader = reader_class(dataset, **self.format_kwargs)
        return reader.to_iterable()

    @cached_property
    def classifier(self):
        """The classifier object. Must be provided by descendant classes."""
        raise NotImplementedError('Must implement the "classifier" property.')

    def classify(self, text):
        """Classify a string of text. Must be provided by descendant classes."""
        raise NotImplementedError('Must implement a "classify" method.')

    def train(self, labeled_featureset):
        """Train the classifier. Must be provided by descendant classes."""
        raise NotImplementedError('Must implement a "train" method.')

    def labels(self):
        """Return an iterable containing the possible labels. Must be provided
        by descendant classes.
        """
        raise NotImplementedError('Must implement a "labels" method.')

class NLTKClassifier(BaseClassifier):
    """An abstract class that wraps around the nltk.classify module.

    Expects that descendant classes include a class variable ``nltk_class``
    which is the class in the nltk.classify module to be wrapped.

    Example: ::

        class MyClassifier(NLTKClassifier):
            nltk_class = nltk.classify.svm.SvmClassifier
    """

    #: The NLTK class to be wrapped. Must be a class within nltk.classify
    nltk_class = None

    def __init__(
        self, train_set, feature_extractor=basic_extractor, format=None, **kwargs
    ):
        super().__init__(train_set, feature_extractor, format, **kwargs)
        self.train_features = self._labeled_features(self.train_set)

    def __repr__(self):
        class_name = self.__class__.__name__
        return f"<{class_name} trained on {len(self.train_set)} instances>"

    def _labeled_features(self, dataset):
        """Return a list of ``(features, label)`` pairs, one for each
        ``(document, label)`` pair in ``dataset``.
        """
        return [(self.extract_features(d), c) for d, c in dataset]

    def _train_nltk_class(self, *args, **kwargs):
        """Train ``nltk_class`` on ``self.train_features``, store the result
        as ``self.classifier`` and return it.

        :raises ValueError: If an ``AttributeError`` is raised while training,
            e.g. because ``nltk_class`` is still ``None``.
        """
        try:
            trained = self.nltk_class.train(self.train_features, *args, **kwargs)
            self.classifier = trained
        except AttributeError as error:  # nltk_class has not been defined
            raise ValueError(
                "NLTKClassifier must have a nltk_class variable that is not None."
            ) from error
        return trained

    @cached_property
    def classifier(self):
        """The classifier, obtained by calling :meth:`train` with no arguments."""
        try:
            return self.train()
        except AttributeError as error:  # nltk_class has not been defined
            raise ValueError(
                "NLTKClassifier must have a nltk_class variable that is not None."
            ) from error

    def train(self, *args, **kwargs):
        """Train the classifier with a labeled feature set and return
        the classifier. Takes the same arguments as the wrapped NLTK class.
        This method is implicitly called when calling ``classify`` or
        ``accuracy`` methods and is included only to allow passing in arguments
        to the ``train`` method of the wrapped NLTK class.

        .. versionadded:: 0.6.2

        :rtype: A classifier
        :raises ValueError: If ``nltk_class`` has not been defined.
        """
        return self._train_nltk_class(*args, **kwargs)

    def labels(self):
        """Return an iterable of possible labels."""
        return self.classifier.labels()

    def classify(self, text):
        """Classifies the text.

        :param str text: A string of text.
        """
        text_features = self.extract_features(text)
        return self.classifier.classify(text_features)

    def accuracy(self, test_set, format=None):
        """Compute the accuracy on a test set.

        :param test_set: A list of tuples of the form ``(text, label)``, or a
            file pointer.
        :param format: If ``test_set`` is a file-like object, the file format,
            e.g. ``"csv"`` or ``"json"``. If ``None``, will attempt to detect
            the file format.
        """
        if is_filelike(test_set):
            test_data = self._read_data(test_set, format)
        else:  # test_set is a list of tuples
            test_data = test_set
        test_features = self._labeled_features(test_data)
        return nltk.classify.accuracy(self.classifier, test_features)

    def update(self, new_data, *args, **kwargs):
        """Update the classifier with new training data and re-train the
        classifier.

        :param new_data: New data as a list of tuples of the form
            ``(text, label)``.
        :param args: Extra positional arguments for the wrapped class's
            ``train`` method.
        :param kwargs: Extra keyword arguments for the wrapped class's
            ``train`` method.
        :returns: ``True`` once re-training has completed.
        :raises ValueError: If ``nltk_class`` has not been defined.
        """
        # ``new_data`` is iterated twice below; materialize it so a one-shot
        # iterator is not exhausted by the first pass.
        new_data = list(new_data)
        # Build a new list rather than using ``+=``, which would extend the
        # very list object the caller handed to the constructor.
        self.train_set = [*self.train_set, *new_data]
        self._word_set.update(_get_words_from_dataset(new_data))
        # Features are recomputed for the whole (extended) training set.
        self.train_features = self._labeled_features(self.train_set)
        self._train_nltk_class(*args, **kwargs)
        return True

class NaiveBayesClassifier(NLTKClassifier):
    """A classifier based on the Naive Bayes algorithm, as implemented in
    NLTK.

    :param train_set: The training set, either a list of tuples of the form
        ``(text, classification)`` or a file-like object. ``text`` may be
        either a string or an iterable.
    :param feature_extractor: A feature extractor function that takes one or
        two arguments: ``document`` and ``train_set``.
    :param format: If ``train_set`` is a file-like object, the file format,
        e.g. ``"csv"`` or ``"json"``. If ``None``, will attempt to detect the
        file format.

    .. versionadded:: 0.6.0
    """

    nltk_class = nltk.classify.NaiveBayesClassifier

    def prob_classify(self, text):
        """Return the label probability distribution for classifying a string
        of text.

        Example:
        ::

            >>> classifier = NaiveBayesClassifier(train_data)
            >>> prob_dist = classifier.prob_classify("I feel happy this morning.")
            >>> prob_dist.max()
            'positive'
            >>> prob_dist.prob("positive")
            0.7

        :rtype: nltk.probability.DictionaryProbDist
        """
        return self.classifier.prob_classify(self.extract_features(text))

    def informative_features(self, *args, **kwargs):
        """Return the most informative features as a list of tuples of the
        form ``(feature_name, feature_value)``.

        Arguments are forwarded to the wrapped classifier's
        ``most_informative_features``.

        :rtype: list
        """
        wrapped = self.classifier
        return wrapped.most_informative_features(*args, **kwargs)

    def show_informative_features(self, *args, **kwargs):
        """Display a listing of the most informative features for this
        classifier.

        Arguments are forwarded to the wrapped classifier's
        ``show_most_informative_features``.

        :rtype: None
        """
        wrapped = self.classifier
        return wrapped.show_most_informative_features(*args, **kwargs)

class DecisionTreeClassifier(NLTKClassifier):
    """A classifier based on the decision tree algorithm, as implemented in
    NLTK.

    :param train_set: The training set, either a list of tuples of the form
        ``(text, classification)`` or a file-like object. ``text`` may be
        either a string or an iterable.
    :param feature_extractor: A feature extractor function that takes one or
        two arguments: ``document`` and ``train_set``.
    :param format: If ``train_set`` is a file-like object, the file format,
        e.g. ``"csv"`` or ``"json"``. If ``None``, will attempt to detect the
        file format.

    .. versionadded:: 0.6.2
    """

    nltk_class = nltk.classify.decisiontree.DecisionTreeClassifier

    def pretty_format(self, *args, **kwargs):
        """Return a string containing a pretty-printed version of this decision
        tree. Each line in the string corresponds to a single decision tree node
        or leaf, and indentation is used to display the structure of the tree.

        Arguments are forwarded to the wrapped classifier's ``pretty_format``.

        :rtype: str
        """
        tree = self.classifier
        return tree.pretty_format(*args, **kwargs)

    # Backwards-compat: older name for ``pretty_format``.
    pprint = pretty_format

    def pseudocode(self, *args, **kwargs):
        """Return a string representation of this decision tree that expresses
        the decisions it makes as a nested set of pseudocode if statements.

        Arguments are forwarded to the wrapped classifier's ``pseudocode``.

        :rtype: str
        """
        tree = self.classifier
        return tree.pseudocode(*args, **kwargs)

class PositiveNaiveBayesClassifier(NLTKClassifier):
    """A variant of the Naive Bayes Classifier that performs binary
    classification with partially-labeled training sets, i.e. when only
    one class is labeled and the other is not. Assuming a prior distribution
    on the two labels, uses the unlabeled set to estimate the frequencies of
    the features.

    Example usage:
    ::

        >>> from textblob.classifiers import PositiveNaiveBayesClassifier
        >>> sports_sentences = ['The team dominated the game',
        ...                   'They lost the ball',
        ...                   'The game was intense',
        ...                   'The goalkeeper catched the ball',
        ...                   'The other team controlled the ball']
        >>> various_sentences = ['The President did not comment',
        ...                        'I lost the keys',
        ...                        'The team won the game',
        ...                        'Sara has two kids',
        ...                        'The ball went off the court',
        ...                        'They had the ball for the whole game',
        ...                        'The show is over']
        >>> classifier = PositiveNaiveBayesClassifier(positive_set=sports_sentences,
        ...                                           unlabeled_set=various_sentences)
        >>> classifier.classify("My team lost the game")
        True
        >>> classifier.classify("And now for something completely different.")
        False


    :param positive_set: A collection of strings that have the positive label.
    :param unlabeled_set: A collection of unlabeled strings.
    :param feature_extractor: A feature extractor function.
    :param positive_prob_prior: A prior estimate of the probability of the
        label ``True``.

    .. versionadded:: 0.7.0
    """

    nltk_class = nltk.classify.PositiveNaiveBayesClassifier

    def __init__(
        self,
        positive_set,
        unlabeled_set,
        feature_extractor=contains_extractor,
        positive_prob_prior=0.5,
        **kwargs,
    ):
        # NOTE(review): the base-class initializer is not invoked here and
        # ``kwargs`` is accepted but unused, so attributes it would set
        # (``train_set``, ``_word_set``, ``format_kwargs``) do not exist on
        # instances of this class -- confirm this is intended.
        self.feature_extractor = feature_extractor
        self.positive_set = positive_set
        self.unlabeled_set = unlabeled_set
        self.positive_features = [self.extract_features(d) for d in self.positive_set]
        self.unlabeled_features = [self.extract_features(d) for d in self.unlabeled_set]
        self.positive_prob_prior = positive_prob_prior

    def __repr__(self):
        class_name = self.__class__.__name__
        return (
            f"<{class_name} trained on {len(self.positive_set)} labeled "
            f"and {len(self.unlabeled_set)} unlabeled instances>"
        )

    # Override
    def train(self, *args, **kwargs):
        """Train the classifier with a labeled and unlabeled feature sets and
        return the classifier. Takes the same arguments as the wrapped NLTK
        class. This method is implicitly called when calling ``classify`` or
        ``accuracy`` methods and is included only to allow passing in arguments
        to the ``train`` method of the wrapped NLTK class.

        :param args: Extra positional arguments, passed to the wrapped class's
            ``train`` after the positive features, unlabeled features and
            prior.
        :param kwargs: Extra keyword arguments for the wrapped class's
            ``train``.
        :rtype: A classifier
        """
        # Forward the extra arguments, as ``update`` does; they used to be
        # accepted here but silently dropped.
        self.classifier = self.nltk_class.train(
            self.positive_features,
            self.unlabeled_features,
            self.positive_prob_prior,
            *args,
            **kwargs,
        )
        return self.classifier

    def update(
        self,
        new_positive_data=None,
        new_unlabeled_data=None,
        positive_prob_prior=0.5,
        *args,
        **kwargs,
    ):
        """Update the classifier with new data and re-train the classifier.

        :param new_positive_data: List of new, labeled strings.
        :param new_unlabeled_data: List of new, unlabeled strings.
        :param positive_prob_prior: Prior estimate of the probability of the
            label ``True``. It replaces the stored prior on every call, so
            omitting it sets the prior to ``0.5`` whatever value was given
            at construction.
        :param args: Extra positional arguments for the wrapped class's
            ``train``.
        :param kwargs: Extra keyword arguments for the wrapped class's
            ``train``.
        :returns: ``True`` once re-training has completed.
        """
        self.positive_prob_prior = positive_prob_prior
        if new_positive_data:
            # Materialize once (the data is iterated twice) and build a new
            # list instead of ``+=`` so the caller's collection is untouched.
            new_positive_data = list(new_positive_data)
            self.positive_set = [*self.positive_set, *new_positive_data]
            self.positive_features += [
                self.extract_features(d) for d in new_positive_data
            ]
        if new_unlabeled_data:
            new_unlabeled_data = list(new_unlabeled_data)
            self.unlabeled_set = [*self.unlabeled_set, *new_unlabeled_data]
            self.unlabeled_features += [
                self.extract_features(d) for d in new_unlabeled_data
            ]
        self.classifier = self.nltk_class.train(
            self.positive_features,
            self.unlabeled_features,
            self.positive_prob_prior,
            *args,
            **kwargs,
        )
        return True

class MaxEntClassifier(NLTKClassifier):
    # Reuse the wrapped NLTK class's documentation as this class's docstring.
    __doc__ = nltk.classify.MaxentClassifier.__doc__
    nltk_class = nltk.classify.MaxentClassifier

    def prob_classify(self, text):
        """Return the label probability distribution for classifying a string
        of text.

        Example:
        ::

            >>> classifier = MaxEntClassifier(train_data)
            >>> prob_dist = classifier.prob_classify("I feel happy this morning.")
            >>> prob_dist.max()
            'positive'
            >>> prob_dist.prob("positive")
            0.7

        :param text: The text to classify; it is passed through
            ``extract_features`` before being handed to the wrapped classifier.
        :rtype: nltk.probability.DictionaryProbDist
        """
        feats = self.extract_features(text)
        return self.classifier.prob_classify(feats)